From e477fde9d55c8c8e6188997f4f2ea67b08e980d4 Mon Sep 17 00:00:00 2001
From: liyin
Date: Wed, 20 Mar 2019 16:51:03 +0800
Subject: [PATCH] Support padding for 1x1

---
 mace/core/buffer.h                          |   2 +-
 mace/ops/arm/activation_neon.cc             |   2 +-
 mace/ops/arm/activation_neon.h              |   2 +-
 mace/ops/arm/common_neon.h                  |   2 +-
 mace/ops/arm/conv_2d_neon.h                 | 109 ---
 mace/ops/arm/conv_2d_neon_15x1.cc           | 161 ----
 mace/ops/arm/conv_2d_neon_1x15.cc           | 147 ----
 mace/ops/arm/conv_2d_neon_1x7.cc            | 251 ------
 mace/ops/arm/conv_2d_neon_7x1.cc            | 291 -------
 mace/ops/arm/deconv_2d_neon.h               |   2 +-
 mace/ops/arm/deconv_2d_neon_2x2.cc          |   2 +-
 mace/ops/arm/deconv_2d_neon_3x3.cc          |   2 +-
 mace/ops/arm/deconv_2d_neon_4x4.cc          |   2 +-
 mace/ops/arm/depthwise_conv2d_neon.h        |   2 +-
 mace/ops/arm/depthwise_conv2d_neon_3x3.cc   |   2 +-
 mace/ops/arm/depthwise_deconv2d_neon.h      |   2 +-
 mace/ops/arm/depthwise_deconv2d_neon_3x3.cc |   2 +-
 mace/ops/arm/depthwise_deconv2d_neon_4x4.cc |   2 +-
 mace/ops/arm/fp32/conv_2d.cc                | 247 ++++++
 mace/ops/arm/fp32/conv_2d.h                 |  46 +-
 mace/ops/arm/fp32/conv_2d_1x1.cc            |  63 +-
 mace/ops/arm/fp32/conv_2d_1x1.h             |   6 +-
 mace/ops/arm/fp32/conv_2d_1xn.cc            | 821 ++++++++++++++++++
 mace/ops/arm/fp32/conv_2d_1xn.h             |  86 ++
 .../conv_2d_3x3.cc}                         | 172 ++--
 mace/ops/arm/fp32/conv_2d_3x3.h             |  60 ++
 mace/ops/arm/fp32/conv_2d_3x3_winograd.cc   | 111 +--
 mace/ops/arm/fp32/conv_2d_3x3_winograd.h    |  18 +-
 .../conv_2d_5x5.cc}                         |  98 ++-
 mace/ops/arm/fp32/conv_2d_5x5.h             |  48 +
 .../conv_2d_7x7.cc}                         | 267 +++---
 mace/ops/arm/fp32/conv_2d_7x7.h             |  73 ++
 mace/ops/arm/fp32/conv_general.cc           | 232 +++++
 mace/ops/arm/fp32/conv_general.h            |  50 ++
 mace/ops/arm/fp32/gemm.h                    |   2 +-
 mace/ops/arm/fp32/gemv.h                    |   2 +-
 mace/ops/arm/q8/eltwise.h                   |   2 +-
 mace/ops/arm/q8/gemv.h                      |   2 +-
 mace/ops/conv_2d.cc                         | 590 ++-----------
 mace/ops/ref/conv_2d.cc                     |  52 +-
 mace/ops/ref/conv_2d.h                      |  48 +-
 .../tools/converter_tool/transformer.py     |   3 +-
 42 files changed, 2275 insertions(+), 1809 deletions(-)
 delete mode 100644 mace/ops/arm/conv_2d_neon.h
 delete mode 100644 mace/ops/arm/conv_2d_neon_15x1.cc
 delete mode 100644 mace/ops/arm/conv_2d_neon_1x15.cc
 delete mode 100644 mace/ops/arm/conv_2d_neon_1x7.cc
 delete mode 100644 mace/ops/arm/conv_2d_neon_7x1.cc
 create mode 100644 mace/ops/arm/fp32/conv_2d.cc
 create mode 100644 mace/ops/arm/fp32/conv_2d_1xn.cc
 create mode 100644 mace/ops/arm/fp32/conv_2d_1xn.h
 rename mace/ops/arm/{conv_2d_neon_3x3.cc => fp32/conv_2d_3x3.cc} (85%)
 create mode 100644 mace/ops/arm/fp32/conv_2d_3x3.h
 rename mace/ops/arm/{conv_2d_neon_5x5.cc => fp32/conv_2d_5x5.cc} (77%)
 create mode 100644 mace/ops/arm/fp32/conv_2d_5x5.h
 rename mace/ops/arm/{conv_2d_neon_7x7.cc => fp32/conv_2d_7x7.cc} (78%)
 create mode 100644 mace/ops/arm/fp32/conv_2d_7x7.h
 create mode 100644 mace/ops/arm/fp32/conv_general.cc
 create mode 100644 mace/ops/arm/fp32/conv_general.h

diff --git a/mace/core/buffer.h b/mace/core/buffer.h
index 418da27d..d1f5f1a5 100644
--- a/mace/core/buffer.h
+++ b/mace/core/buffer.h
@@ -503,7 +503,7 @@ class ScratchBuffer: public Buffer {
   virtual ~ScratchBuffer() {}
 
   MaceStatus GrowSize(const index_t size) {
-    if (size > size_) {
+    if (offset_ + size > size_) {
       VLOG(1) << "Grow scratch size to: " << size;
       MACE_CHECK(offset_ == 0, "scratch is being used, cannot grow size");
       return Resize(size);
diff --git a/mace/ops/arm/activation_neon.cc b/mace/ops/arm/activation_neon.cc
index 6010d714..09cfd8d4 100644
--- a/mace/ops/arm/activation_neon.cc
+++ b/mace/ops/arm/activation_neon.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/activation_neon.h b/mace/ops/arm/activation_neon.h
index a61b974b..d640e689 100644
--- a/mace/ops/arm/activation_neon.h
+++ b/mace/ops/arm/activation_neon.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/common_neon.h b/mace/ops/arm/common_neon.h
index c3451ea0..8d28f558 100644
--- a/mace/ops/arm/common_neon.h
+++ b/mace/ops/arm/common_neon.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/conv_2d_neon.h b/mace/ops/arm/conv_2d_neon.h
deleted file mode 100644
index b1fbd858..00000000
--- a/mace/ops/arm/conv_2d_neon.h
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MACE_OPS_ARM_CONV_2D_NEON_H_
-#define MACE_OPS_ARM_CONV_2D_NEON_H_
-
-#include "mace/core/types.h"
-
-namespace mace {
-namespace ops {
-
-void Conv2dNeonK3x3S1(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output);
-
-void Conv2dNeonK3x3S2(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output);
-
-void Conv2dNeonK5x5S1(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output);
-
-void Conv2dNeonK1x7S1(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output);
-
-void Conv2dNeonK7x1S1(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output);
-
-void Conv2dNeonK7x7S1(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output);
-
-void Conv2dNeonK7x7S2(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output);
-
-void Conv2dNeonK7x7S3(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output);
-
-void Conv2dNeonK1x15S1(const float *input,
-                       const float *filter,
-                       const index_t *in_shape,
-                       const index_t *out_shape,
-                       float *output);
-
-void Conv2dNeonK15x1S1(const float *input,
-                       const float *filter,
-                       const index_t *in_shape,
-                       const index_t *out_shape,
-                       float *output);
-
-// calculate one output channel and one input channel
-inline void Conv2dCPUKHxKWCalc(const float *in_ptr,
-                               const float *filter_ptr,
-                               const index_t in_width,
-                               const index_t filter_height,
-                               const index_t filter_width,
-                               const index_t out_height,
-                               const index_t out_width,
-                               float *out_ptr,
-                               const int stride) {
-  for (index_t h = 0; h < out_height; ++h) {
-    for (index_t w = 0; w < out_width; ++w) {
-      for (int i = 0; i < filter_height; ++i) {
-        for (int j = 0; j < filter_width; ++j) {
-          out_ptr[h * out_width + w] +=
-              in_ptr[(h * stride + i) * in_width + (w * stride + j)] *
-              filter_ptr[i * filter_width + j];
-        }
-      }
-    }
-  }
-}
-
-}  // namespace ops
-}  // namespace mace
-
-#endif  // MACE_OPS_ARM_CONV_2D_NEON_H_
diff --git a/mace/ops/arm/conv_2d_neon_15x1.cc b/mace/ops/arm/conv_2d_neon_15x1.cc
deleted file mode 100644
index 5cd58fcc..00000000
--- a/mace/ops/arm/conv_2d_neon_15x1.cc
+++ /dev/null
@@ -1,161 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#if defined(MACE_ENABLE_NEON)
-#include <arm_neon.h>
-#endif
-
-#include "mace/ops/arm/conv_2d_neon.h"
-#include "mace/utils/math.h"
-
-namespace mace {
-namespace ops {
-
-inline void Conv2dCPUK15x1Calc(const float *in_ptr,
-                               const float *filter_ptr,
-                               const index_t in_width,
-                               const index_t in_channels,
-                               const index_t out_height,
-                               const index_t out_width,
-                               const index_t w,
-                               const index_t tile_width,
-                               const index_t out_image_size,
-                               float *out_ptr,
-                               const index_t io,
-                               const int stride) {
-  for (index_t ih = 0; ih < out_height; ++ih) {
-    for (index_t iw = 0; iw < tile_width && w + iw < out_width; ++iw) {
-      for (int i = 0; i < 15; ++i) {
-        for (int j = 0; j < 1; ++j) {
-          out_ptr[io * out_image_size + ih * out_width + w + iw] +=
-              in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)] *
-              filter_ptr[io * in_channels * 15 + i * 1 + j];
-        }
-      }
-    }
-  }
-}
-
-// Ho = 4, Wo = 1, Co = 1
-void Conv2dNeonK15x1S1(const float *input,
-                       const float *filter,
-                       const index_t *in_shape,
-                       const index_t *out_shape,
-                       float *output) {
-  const index_t in_image_size = in_shape[2] * in_shape[3];
-  const index_t out_image_size = out_shape[2] * out_shape[3];
-  const index_t in_batch_size = in_shape[1] * in_image_size;
-  const index_t out_batch_size = out_shape[1] * out_image_size;
-  const index_t tile_width =
-      out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3];
-
-#pragma omp parallel for collapse(3) schedule(runtime)
-  for (index_t b = 0; b < out_shape[0]; ++b) {
-    for (index_t m = 0; m < out_shape[1]; ++m) {
-      for (index_t w = 0; w < out_shape[3]; w += tile_width) {
-        const index_t out_height = out_shape[2];
-        const index_t out_width = out_shape[3];
-        const index_t in_channels = in_shape[1];
-        const index_t in_width = in_shape[3];
-        float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
-        for (index_t c = 0; c < in_channels; ++c) {
-          const float *in_ptr_base =
-              input + b * in_batch_size + c * in_image_size;
-          const float *filter_ptr = filter + m * in_channels * 15 + c * 15;
-#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
-          /* load filter (1 outch x 4 height x 1 width) */
-          float32x4_t vf0, vf1, vf2, vf3;
-          vf0 = vld1q_f32(filter_ptr);
-          vf1 = vld1q_f32(filter_ptr + 4);
-          vf2 = vld1q_f32(filter_ptr + 8);
-          vf3 = vld1q_f32(filter_ptr + 11);
-
-          for (index_t h = 0; h + 3 < out_height; h += 4) {
-            for (index_t wt = 0; wt < tile_width && w + wt < out_width; ++wt) {
-              // load output
-              index_t out_offset = h * out_width + w + wt;
-              // output (1 outch x 4 height x 1 width): vo_outch_height
-              float32x4_t vo = {out_ptr_base[out_offset],
-                                out_ptr_base[out_offset + out_width],
-                                out_ptr_base[out_offset + 2 * out_width],
-                                out_ptr_base[out_offset + 3 * out_width]};
-
-              // input offset
-              index_t in_offset = h * in_width + w + wt;
-              // input (3 slide)
-              float32x4_t vi0 = {in_ptr_base[in_offset],
-                                 in_ptr_base[in_offset + in_width],
-                                 in_ptr_base[in_offset + 2 * in_width],
-                                 in_ptr_base[in_offset + 3 * in_width]};
-              float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width],
-                                 in_ptr_base[in_offset + 5 * in_width],
-                                 in_ptr_base[in_offset + 6 * in_width],
-                                 in_ptr_base[in_offset + 7 * in_width]};
-              float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width],
-                                 in_ptr_base[in_offset + 9 * in_width],
-                                 in_ptr_base[in_offset + 10 * in_width],
-                                 in_ptr_base[in_offset + 11 * in_width]};
-              float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width],
-                                  in_ptr_base[in_offset + 13 * in_width],
-                                  in_ptr_base[in_offset + 14 * in_width],
-                                  in_ptr_base[in_offset + 15 * in_width]};
-              float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width],
-                                  in_ptr_base[in_offset + 17 * in_width]};
-              float32x4_t vi1 = vextq_f32(vi0, vi4, 1);
-              float32x4_t vi2 = vextq_f32(vi0, vi4, 2);
-              float32x4_t vi3 = vextq_f32(vi0, vi4, 3);
-              float32x4_t vi5 = vextq_f32(vi4, vi8, 1);
-              float32x4_t vi6 = vextq_f32(vi4, vi8, 2);
-              float32x4_t vi7 = vextq_f32(vi4, vi8, 3);
-              float32x4_t vi9 = vextq_f32(vi8, vi12, 1);
-              float32x4_t vi10 = vextq_f32(vi8, vi12, 2);
-              float32x4_t vi11 = vextq_f32(vi8, vi12, 3);
-              float32x4_t vi13 = vextq_f32(vi12, vi16, 1);
-              float32x4_t vi14 = vextq_f32(vi12, vi16, 2);
-
-              vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0);
-              vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1);
-              vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0);
-              vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1);
-              vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0);
-              vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1);
-              vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0);
-              vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1);
-              vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0);
-              vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1);
-              vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0);
-              vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1);
-              vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1);
-              vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0);
-              vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1);
-
-              out_ptr_base[out_offset] = vo[0];
-              out_ptr_base[out_offset + out_width] = vo[1];
-              out_ptr_base[out_offset + 2 * out_width] = vo[2];
-              out_ptr_base[out_offset + 3 * out_width] = vo[3];
-            }  // wt
-          }  // h
-#else
-          Conv2dCPUK15x1Calc(in_ptr_base, filter_ptr, in_width, in_channels,
-                             out_height, out_width, w, tile_width,
-                             out_image_size, out_ptr_base, 0, 1);
-#endif
-        }  // c
-      }  // w
-    }  // m
-  }  // b
-}
-
-}  // namespace ops
-}  // namespace mace
diff --git a/mace/ops/arm/conv_2d_neon_1x15.cc b/mace/ops/arm/conv_2d_neon_1x15.cc
deleted file mode 100644
index b8837490..00000000
--- a/mace/ops/arm/conv_2d_neon_1x15.cc
+++ /dev/null
@@ -1,147 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#if defined(MACE_ENABLE_NEON)
-#include <arm_neon.h>
-#endif
-
-#include "mace/ops/arm/conv_2d_neon.h"
-#include "mace/utils/logging.h"
-#include "mace/utils/math.h"
-
-namespace mace {
-namespace ops {
-
-inline void Conv2dCPUK1x15Calc(const float *in_ptr,
-                               const float *filter_ptr,
-                               const index_t in_width,
-                               const index_t in_channels,
-                               const index_t out_height,
-                               const index_t h,
-                               const index_t tile_height,
-                               const index_t out_width,
-                               const index_t out_image_size,
-                               float *out_ptr,
-                               const index_t io,
-                               const int stride) {
-  for (index_t ih = 0; ih < tile_height && h + ih < out_height; ++ih) {
-    for (index_t iw = 0; iw < out_width; ++iw) {
-      for (int i = 0; i < 1; ++i) {
-        for (int j = 0; j < 15; ++j) {
-          out_ptr[io * out_image_size + (h + ih) * out_width + iw] +=
-              in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)] *
-              filter_ptr[io * in_channels * 15 + i * 15 + j];
-        }
-      }
-    }
-  }
-}
-
-// Ho = 1, Wo = 4, Co = 1
-void Conv2dNeonK1x15S1(const float *input,
-                       const float *filter,
-                       const index_t *in_shape,
-                       const index_t *out_shape,
-                       float *output) {
-  const index_t in_image_size = in_shape[2] * in_shape[3];
-  const index_t out_image_size = out_shape[2] * out_shape[3];
-  const index_t in_batch_size = in_shape[1] * in_image_size;
-  const index_t out_batch_size = out_shape[1] * out_image_size;
-  const index_t tile_height =
-      out_shape[1] < 4 ? RoundUpDiv4(out_shape[2]) : out_shape[2];
-
-#pragma omp parallel for collapse(3) schedule(runtime)
-  for (index_t b = 0; b < out_shape[0]; ++b) {
-    for (index_t m = 0; m < out_shape[1]; ++m) {
-      for (index_t h = 0; h < out_shape[2]; h += tile_height) {
-        const index_t out_height = out_shape[2];
-        const index_t out_width = out_shape[3];
-        const index_t in_channels = in_shape[1];
-        const index_t in_width = in_shape[3];
-        float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
-        for (index_t c = 0; c < in_channels; ++c) {
-          const float *in_ptr_base =
-              input + b * in_batch_size + c * in_image_size;
-          const float *filter_ptr = filter + m * in_channels * 15 + c * 15;
-#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
-          /* load filter (1 outch x 4 height x 1 width) */
-          float32x4_t vf0, vf1, vf2, vf3;
-          vf0 = vld1q_f32(filter_ptr);
-          vf1 = vld1q_f32(filter_ptr + 4);
-          vf2 = vld1q_f32(filter_ptr + 8);
-          vf3 = vld1q_f32(filter_ptr + 11);
-
-          for (index_t ht = 0; ht < tile_height && h + ht < out_height; ++ht) {
-            for (index_t w = 0; w + 3 < out_width; w += 4) {
-              // output (1 outch x 1 height x 4 width): vo_outch_height
-              float32x4_t vo;
-              // load output
-              index_t out_offset = (h + ht) * out_width + w;
-              vo = vld1q_f32(out_ptr_base + out_offset);
-
-              // input (3 slide)
-              float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9,
-                  vi10, vi11, vi12, vi13, vi14, vi16;
-              // input offset
-              index_t in_offset = (h + ht) * in_width + w;
-              // load input
-              vi0 = vld1q_f32(in_ptr_base + in_offset);
-              vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
-              vi8 = vld1q_f32(in_ptr_base + in_offset + 8);
-              vi12 = vld1q_f32(in_ptr_base + in_offset + 12);
-              vi16 = vld1q_f32(in_ptr_base + in_offset + 16);
-              vi1 = vextq_f32(vi0, vi4, 1);
-              vi2 = vextq_f32(vi0, vi4, 2);
-              vi3 = vextq_f32(vi0, vi4, 3);
-              vi5 = vextq_f32(vi4, vi8, 1);
-              vi6 = vextq_f32(vi4, vi8, 2);
-              vi7 = vextq_f32(vi4, vi8, 3);
-              vi9 = vextq_f32(vi8, vi12, 1);
-              vi10 = vextq_f32(vi8, vi12, 2);
-              vi11 = vextq_f32(vi8, vi12, 3);
-              vi13 = vextq_f32(vi12, vi16, 1);
-              vi14 = vextq_f32(vi12, vi16, 2);
-
-              vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0);
-              vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1);
-              vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0);
-              vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1);
-              vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0);
-              vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1);
-              vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0);
-              vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1);
-              vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0);
-              vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1);
-              vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0);
-              vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1);
-              vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1);
-              vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0);
-              vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1);
-
-              vst1q_f32(out_ptr_base + out_offset, vo);
-            }  // w
-          }  // ht
-#else
-          Conv2dCPUK1x15Calc(in_ptr_base, filter_ptr, in_width, in_channels,
-                             out_height, h, tile_height, out_width,
-                             out_image_size, out_ptr_base, 0, 1);
-#endif
-        }  // c
-      }  // h
-    }  // m
-  }  // b
-}
-
-}  // namespace ops
-}  // namespace mace
diff --git a/mace/ops/arm/conv_2d_neon_1x7.cc b/mace/ops/arm/conv_2d_neon_1x7.cc
deleted file mode 100644
index e5e249d3..00000000
--- a/mace/ops/arm/conv_2d_neon_1x7.cc
+++ /dev/null
@@ -1,251 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#if defined(MACE_ENABLE_NEON)
-#include <arm_neon.h>
-#endif
-
-#include "mace/ops/arm/conv_2d_neon.h"
-
-namespace mace {
-namespace ops {
-
-// Ho = 1, Wo = 4, Co = 4
-void Conv2dNeonK1x7S1(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output) {
-  const index_t in_image_size = in_shape[2] * in_shape[3];
-  const index_t out_image_size = out_shape[2] * out_shape[3];
-  const index_t in_batch_size = in_shape[1] * in_image_size;
-  const index_t out_batch_size = out_shape[1] * out_image_size;
-
-#pragma omp parallel for collapse(2) schedule(runtime)
-  for (index_t b = 0; b < out_shape[0]; ++b) {
-    for (index_t m = 0; m < out_shape[1]; m += 4) {
-      const index_t out_channels = out_shape[1];
-      const index_t out_height = out_shape[2];
-      const index_t out_width = out_shape[3];
-      const index_t in_channels = in_shape[1];
-      const index_t in_width = in_shape[3];
-      if (m + 3 < out_channels) {
-        float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
-#if defined(MACE_ENABLE_NEON)
-        float *out_ptr1_base =
-            output + b * out_batch_size + (m + 1) * out_image_size;
-        float *out_ptr2_base =
-            output + b * out_batch_size + (m + 2) * out_image_size;
-        float *out_ptr3_base =
-            output + b * out_batch_size + (m + 3) * out_image_size;
-#endif
-        for (index_t c = 0; c < in_channels; ++c) {
-          const float *in_ptr_base =
-              input + b * in_batch_size + c * in_image_size;
-          const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7;
-#if defined(MACE_ENABLE_NEON)
-          const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7;
-          const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7;
-          const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7;
-          /* load filter (4 outch x 1 height x 4 width) */
-          float32x4_t vf00, vf01;
-          float32x4_t vf10, vf11;
-          float32x4_t vf20, vf21;
-          float32x4_t vf30, vf31;
-          vf00 = vld1q_f32(filter_ptr0);
-          vf01 = vld1q_f32(filter_ptr0 + 3);
-          vf10 = vld1q_f32(filter_ptr1);
-          vf11 = vld1q_f32(filter_ptr1 + 3);
-          vf20 = vld1q_f32(filter_ptr2);
-          vf21 = vld1q_f32(filter_ptr2 + 3);
-          vf30 = vld1q_f32(filter_ptr3);
-          vf31 = vld1q_f32(filter_ptr3 + 3);
-
-          for (index_t h = 0; h < out_height; ++h) {
-            for (index_t w = 0; w + 3 < out_width; w += 4) {
-              // output (4 outch x 1 height x 4 width): vo_outch_height
-              float32x4_t vo0, vo1, vo2, vo3;
-              // load output
-              index_t out_offset = h * out_width + w;
-              vo0 = vld1q_f32(out_ptr0_base + out_offset);
-              vo1 = vld1q_f32(out_ptr1_base + out_offset);
-              vo2 = vld1q_f32(out_ptr2_base + out_offset);
-              vo3 = vld1q_f32(out_ptr3_base + out_offset);
-
-              // input (3 slide)
-              float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8;
-              // input offset
-              index_t in_offset = h * in_width + w;
-              // load input
-              vi0 = vld1q_f32(in_ptr_base + in_offset);
-              vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
-              vi8 = vld1q_f32(in_ptr_base + in_offset + 8);
-              vi1 = vextq_f32(vi0, vi4, 1);
-              vi2 = vextq_f32(vi0, vi4, 2);
-              vi3 = vextq_f32(vi0, vi4, 3);
-              vi5 = vextq_f32(vi4, vi8, 1);
-              vi6 = vextq_f32(vi4, vi8, 2);
-
-#if defined(__aarch64__)
-              /* outch 0 */
-              vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0);
-              vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1);
-              vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2);
-              vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3);
-              vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1);
-              vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2);
-              vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
-              /* outch 1 */
-              vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0);
-              vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1);
-              vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2);
-              vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3);
-              vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1);
-              vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2);
-              vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3);
-              /* outch 2 */
-              vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0);
-              vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1);
-              vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2);
-              vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3);
-              vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1);
-              vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2);
-              vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3);
-              /* outch 3 */
-              vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0);
-              vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1);
-              vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2);
-              vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3);
-              vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1);
-              vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2);
-              vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3);
-#else
-              /* outch 0 */
-              vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0);
-              vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1);
-              vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0);
-              vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1);
-              vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1);
-              vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0);
-              vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
-              /* outch 1 */
-              vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0);
-              vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1);
-              vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0);
-              vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1);
-              vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1);
-              vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0);
-              vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1);
-              /* outch 2 */
-              vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0);
-              vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1);
-              vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0);
-              vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1);
-              vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1);
-              vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0);
-              vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1);
-              /* outch 3 */
-              vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0);
-              vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1);
-              vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0);
-              vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1);
-              vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1);
-              vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0);
-              vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1);
-#endif
-
-              vst1q_f32(out_ptr0_base + out_offset, vo0);
-              vst1q_f32(out_ptr1_base + out_offset, vo1);
-              vst1q_f32(out_ptr2_base + out_offset, vo2);
-              vst1q_f32(out_ptr3_base + out_offset, vo3);
-            }  // w
-          }  // h
-#else
-          for (index_t oc = 0; oc < 4; ++oc) {
-            Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 7,
-                               in_width, 1, 7, out_height, out_width,
-                               out_ptr0_base + oc * out_image_size, 1);
-          }
-#endif
-        }  // c
-      } else {
-        for (index_t mm = m; mm < out_channels; ++mm) {
-          float *out_ptr0_base =
-              output + b * out_batch_size + mm * out_image_size;
-          for (index_t c = 0; c < in_channels; ++c) {
-            const float *in_ptr_base =
-                input + b * in_batch_size + c * in_image_size;
-            const float *filter_ptr0 = filter + mm * in_channels * 7 + c * 7;
-#if defined(MACE_ENABLE_NEON)
-            /* load filter (1 outch x 1 height x 4 width) */
-            float32x4_t vf00, vf01;
-            vf00 = vld1q_f32(filter_ptr0);
-            vf01 = vld1q_f32(filter_ptr0 + 3);
-
-            for (index_t h = 0; h < out_height; ++h) {
-              for (index_t w = 0; w + 3 < out_width; w += 4) {
-                // output (1 outch x 1 height x 4 width): vo_outch_height
-                float32x4_t vo0;
-                // load output
-                index_t out_offset = h * out_width + w;
-                vo0 = vld1q_f32(out_ptr0_base + out_offset);
-
-                // input (3 slide)
-                float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8;
-                // input offset
-                index_t in_offset = h * in_width + w;
-                // load input
-                vi0 = vld1q_f32(in_ptr_base + in_offset);
-                vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
-                vi8 = vld1q_f32(in_ptr_base + in_offset + 8);
-                vi1 = vextq_f32(vi0, vi4, 1);
-                vi2 = vextq_f32(vi0, vi4, 2);
-                vi3 = vextq_f32(vi0, vi4, 3);
-                vi5 = vextq_f32(vi4, vi8, 1);
-                vi6 = vextq_f32(vi4, vi8, 2);
-
-#if defined(__aarch64__)
-                vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0);
-                vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1);
-                vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2);
-                vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3);
-                vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1);
-                vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2);
-                vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
-#else
-                vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0);
-                vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1);
-                vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0);
-                vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1);
-                vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1);
-                vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0);
-                vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
-#endif
-
-                vst1q_f32(out_ptr0_base + out_offset, vo0);
-              }  // w
-            }  // h
-#else
-            Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 1, 7,
-                               out_height, out_width, out_ptr0_base, 1);
-#endif
-          }  // c
-        }
-      }  // if
-    }  // m
-  }  // b
-}
-
-}  // namespace ops
-}  // namespace mace
diff --git a/mace/ops/arm/conv_2d_neon_7x1.cc b/mace/ops/arm/conv_2d_neon_7x1.cc
deleted file mode 100644
index 7aa9309b..00000000
--- a/mace/ops/arm/conv_2d_neon_7x1.cc
+++ /dev/null
@@ -1,291 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#if defined(MACE_ENABLE_NEON)
-#include <arm_neon.h>
-#endif
-
-#include "mace/ops/arm/conv_2d_neon.h"
-
-namespace mace {
-namespace ops {
-
-// Ho = 4, Wo = 1, Co = 4
-void Conv2dNeonK7x1S1(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output) {
-  const index_t in_image_size = in_shape[2] * in_shape[3];
-  const index_t out_image_size = out_shape[2] * out_shape[3];
-  const index_t in_batch_size = in_shape[1] * in_image_size;
-  const index_t out_batch_size = out_shape[1] * out_image_size;
-
-#pragma omp parallel for collapse(2) schedule(runtime)
-  for (index_t b = 0; b < out_shape[0]; ++b) {
-    for (index_t m = 0; m < out_shape[1]; m += 4) {
-      const index_t out_channels = out_shape[1];
-      const index_t out_height = out_shape[2];
-      const index_t out_width = out_shape[3];
-      const index_t in_channels = in_shape[1];
-      const index_t in_width = in_shape[3];
-      if (m + 3 < out_channels) {
-        float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
-#if defined(MACE_ENABLE_NEON)
-        float *out_ptr1_base =
-            output + b * out_batch_size + (m + 1) * out_image_size;
-        float *out_ptr2_base =
-            output + b * out_batch_size + (m + 2) * out_image_size;
-        float *out_ptr3_base =
-            output + b * out_batch_size + (m + 3) * out_image_size;
-#endif
-        for (index_t c = 0; c < in_channels; ++c) {
-          const float *in_ptr_base =
-              input + b * in_batch_size + c * in_image_size;
-          const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7;
-#if defined(MACE_ENABLE_NEON)
-          const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7;
-          const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7;
-          const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7;
-          /* load filter (4 outch x 4 height x 1 width) */
-          float32x4_t vf00, vf01;
-          float32x4_t vf10, vf11;
-          float32x4_t vf20, vf21;
-          float32x4_t vf30, vf31;
-          vf00 = vld1q_f32(filter_ptr0);
-          vf01 = vld1q_f32(filter_ptr0 + 3);
-          vf10 = vld1q_f32(filter_ptr1);
-          vf11 = vld1q_f32(filter_ptr1 + 3);
-          vf20 = vld1q_f32(filter_ptr2);
-          vf21 = vld1q_f32(filter_ptr2 + 3);
-          vf30 = vld1q_f32(filter_ptr3);
-          vf31 = vld1q_f32(filter_ptr3 + 3);
-
-          for (index_t h = 0; h + 3 < out_height; h += 4) {
-            for (index_t w = 0; w < out_width; ++w) {
-              // load output
-              index_t out_offset = h * out_width + w;
-              // output (4 outch x 4 height x 1 width): vo_outch_height
-              float32x4_t vo0 = {out_ptr0_base[out_offset],
-                                 out_ptr0_base[out_offset + out_width],
-                                 out_ptr0_base[out_offset + 2 * out_width],
-                                 out_ptr0_base[out_offset + 3 * out_width]};
-              float32x4_t vo1 = {out_ptr1_base[out_offset],
-                                 out_ptr1_base[out_offset + out_width],
-                                 out_ptr1_base[out_offset + 2 * out_width],
-                                 out_ptr1_base[out_offset + 3 * out_width]};
-              float32x4_t vo2 = {out_ptr2_base[out_offset],
-                                 out_ptr2_base[out_offset + out_width],
-                                 out_ptr2_base[out_offset + 2 * out_width],
-                                 out_ptr2_base[out_offset + 3 * out_width]};
-              float32x4_t vo3 = {out_ptr3_base[out_offset],
-                                 out_ptr3_base[out_offset + out_width],
-                                 out_ptr3_base[out_offset + 2 * out_width],
-                                 out_ptr3_base[out_offset + 3 * out_width]};
-
-              // input offset
-              index_t in_offset = h * in_width + w;
-              // input (3 slide)
-              float32x4_t vi0 = {in_ptr_base[in_offset],
-                                 in_ptr_base[in_offset + in_width],
-                                 in_ptr_base[in_offset + 2 * in_width],
-                                 in_ptr_base[in_offset + 3 * in_width]};
-              float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width],
-                                 in_ptr_base[in_offset + 5 * in_width],
-                                 in_ptr_base[in_offset + 6 * in_width],
-                                 in_ptr_base[in_offset + 7 * in_width]};
-              float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width],
-                                 in_ptr_base[in_offset + 9 * in_width]};
-              float32x4_t vi1 = vextq_f32(vi0, vi4, 1);
-              float32x4_t vi2 = vextq_f32(vi0, vi4, 2);
-              float32x4_t vi3 = vextq_f32(vi0, vi4, 3);
-              float32x4_t vi5 = vextq_f32(vi4, vi8, 1);
-              float32x4_t vi6 = vextq_f32(vi4, vi8, 2);
-
-#if defined(__aarch64__)
-              /* outch 0 */
-              vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0);
-              vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1);
-              vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2);
-              vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3);
-              vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1);
-              vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2);
-              vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
-              /* outch 1 */
-              vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0);
-              vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1);
-              vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2);
-              vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3);
-              vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1);
-              vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2);
-              vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3);
-              /* outch 2 */
-              vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0);
-              vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1);
-              vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2);
-              vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3);
-              vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1);
-              vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2);
-              vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3);
-              /* outch 3 */
-              vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0);
-              vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1);
-              vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2);
-              vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3);
-              vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1);
-              vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2);
-              vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3);
-#else
-              /* outch 0 */
-              vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0);
-              vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1);
-              vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0);
-              vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1);
-              vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1);
-              vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0);
-              vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
-              /* outch 1 */
-              vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0);
-              vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1);
-              vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0);
-              vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1);
-              vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1);
-              vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0);
-              vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1);
-              /* outch 2 */
-              vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0);
-              vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1);
-              vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0);
-              vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1);
-              vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1);
-              vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0);
-              vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1);
-              /* outch 3 */
-              vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0);
-              vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1);
-              vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0);
-              vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1);
-              vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1);
-              vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0);
-              vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1);
-#endif
-
-              out_ptr0_base[out_offset] = vo0[0];
-              out_ptr0_base[out_offset + out_width] = vo0[1];
-              out_ptr0_base[out_offset + 2 * out_width] = vo0[2];
-              out_ptr0_base[out_offset + 3 * out_width] = vo0[3];
-              out_ptr1_base[out_offset] = vo1[0];
-              out_ptr1_base[out_offset + out_width] = vo1[1];
-              out_ptr1_base[out_offset + 2 * out_width] = vo1[2];
-              out_ptr1_base[out_offset + 3 * out_width] = vo1[3];
-              out_ptr2_base[out_offset] = vo2[0];
-              out_ptr2_base[out_offset + out_width] = vo2[1];
-              out_ptr2_base[out_offset + 2 * out_width] = vo2[2];
-              out_ptr2_base[out_offset + 3 * out_width] = vo2[3];
-              out_ptr3_base[out_offset] = vo3[0];
-              out_ptr3_base[out_offset + out_width] = vo3[1];
-              out_ptr3_base[out_offset + 2 * out_width] = vo3[2];
-              out_ptr3_base[out_offset + 3 * out_width] = vo3[3];
-            }  // w
-          }  // h
-#else
-          for (index_t oc = 0; oc < 4; ++oc) {
-            Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 7,
-                               in_width, 7, 1, out_height, out_width,
-                               out_ptr0_base + oc * out_image_size, 1);
-          }
-#endif
-        }  // c
-      } else {
-        for (index_t mm = m; mm < out_channels; ++mm) {
-          float *out_ptr0_base =
-              output + b * out_batch_size + mm * out_image_size;
-          for (index_t c = 0; c < in_channels; ++c) {
-            const float *in_ptr_base =
-                input + b * in_batch_size + c * in_image_size;
-            const float *filter_ptr0 = filter + mm * in_channels * 7 + c * 7;
-#if defined(MACE_ENABLE_NEON)
-            /* load filter (1 outch x 4 height x 1 width) */
-            float32x4_t vf00, vf01;
-            vf00 = vld1q_f32(filter_ptr0);
-            vf01 = vld1q_f32(filter_ptr0 + 3);
-
-            for (index_t h = 0; h + 3 < out_height; h += 4) {
-              for (index_t w = 0; w < out_width; ++w) {
-                // load output
-                index_t out_offset = h * out_width + w;
-                // output (1 outch x 4 height x 1 width): vo_outch_height
-                float32x4_t vo0 = {out_ptr0_base[out_offset],
-                                   out_ptr0_base[out_offset + out_width],
-                                   out_ptr0_base[out_offset + 2 * out_width],
-                                   out_ptr0_base[out_offset + 3 * out_width]};
-
-                // input offset
-                index_t in_offset = h * in_width + w;
-                // input (3 slide)
-                float32x4_t vi0 = {in_ptr_base[in_offset],
-                                   in_ptr_base[in_offset + in_width],
-                                   in_ptr_base[in_offset + 2 * in_width],
-                                   in_ptr_base[in_offset + 3 * in_width]};
-                float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width],
-                                   in_ptr_base[in_offset + 5 * in_width],
-                                   in_ptr_base[in_offset + 6 * in_width],
-                                   in_ptr_base[in_offset + 7 * in_width]};
-                float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width],
-                                   in_ptr_base[in_offset + 9 * in_width],
-                                   in_ptr_base[in_offset + 10 * in_width],
-                                   in_ptr_base[in_offset + 11 * in_width]};
-                float32x4_t vi1 = vextq_f32(vi0, vi4, 1);
-                float32x4_t vi2 = vextq_f32(vi0, vi4, 2);
-                float32x4_t vi3 = vextq_f32(vi0, vi4, 3);
-                float32x4_t vi5 = vextq_f32(vi4, vi8, 1);
-                float32x4_t vi6 = vextq_f32(vi4, vi8, 2);
-
-#if defined(__aarch64__)
-                vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0);
-                vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1);
-                vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2);
-                vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3);
-                vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1);
-                vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2);
-                vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
-#else
-                vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0);
-                vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1);
-                vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0);
-                vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1);
-                vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1);
-                vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0);
-                vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
-#endif
-
-                out_ptr0_base[out_offset] = vo0[0];
-                out_ptr0_base[out_offset + out_width] = vo0[1];
-                out_ptr0_base[out_offset + 2 * out_width] = vo0[2];
-                out_ptr0_base[out_offset + 3 * out_width] = vo0[3];
-              }  // w
-            }  // h
-#else
-            Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 1,
-                               out_height, out_width, out_ptr0_base, 1);
-#endif
-          }  // c
-        }
-      }  // if
-    }  // m
-  }  // b
-}
-
-}  // namespace ops
-}  // namespace mace
diff --git a/mace/ops/arm/deconv_2d_neon.h b/mace/ops/arm/deconv_2d_neon.h
index 62e3e919..f45fa923 100644
--- a/mace/ops/arm/deconv_2d_neon.h
+++ b/mace/ops/arm/deconv_2d_neon.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/deconv_2d_neon_2x2.cc b/mace/ops/arm/deconv_2d_neon_2x2.cc
index a1bd1883..674864c8 100644
--- a/mace/ops/arm/deconv_2d_neon_2x2.cc
+++ b/mace/ops/arm/deconv_2d_neon_2x2.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/deconv_2d_neon_3x3.cc b/mace/ops/arm/deconv_2d_neon_3x3.cc
index 2859b7c0..04f62325 100644
--- a/mace/ops/arm/deconv_2d_neon_3x3.cc
+++ b/mace/ops/arm/deconv_2d_neon_3x3.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/deconv_2d_neon_4x4.cc b/mace/ops/arm/deconv_2d_neon_4x4.cc
index 690bea4e..443a188f 100644
--- a/mace/ops/arm/deconv_2d_neon_4x4.cc
+++ b/mace/ops/arm/deconv_2d_neon_4x4.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/depthwise_conv2d_neon.h b/mace/ops/arm/depthwise_conv2d_neon.h
index a4973ed5..b610178c 100644
--- a/mace/ops/arm/depthwise_conv2d_neon.h
+++ b/mace/ops/arm/depthwise_conv2d_neon.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/depthwise_conv2d_neon_3x3.cc b/mace/ops/arm/depthwise_conv2d_neon_3x3.cc
index 9d6ae41e..ced509e0 100644
--- a/mace/ops/arm/depthwise_conv2d_neon_3x3.cc
+++ b/mace/ops/arm/depthwise_conv2d_neon_3x3.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/depthwise_deconv2d_neon.h b/mace/ops/arm/depthwise_deconv2d_neon.h
index 70f2bb40..8df6dba1 100644
--- a/mace/ops/arm/depthwise_deconv2d_neon.h
+++ b/mace/ops/arm/depthwise_deconv2d_neon.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
index a666961b..6bba47c2 100644
--- a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
+++ b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
index a7c10b86..677eb152 100644
--- a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
+++ b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/fp32/conv_2d.cc b/mace/ops/arm/fp32/conv_2d.cc
new file mode 100644
index 00000000..799ee521
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_2d.cc
@@ -0,0 +1,247 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <memory>
+#include <utility>
+
+#include "mace/ops/arm/fp32/conv_2d.h"
+#include "mace/utils/memory.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+void Conv2dBase::CalOutputShapeAndPadSize(const Tensor *input,
+                                          const Tensor *filter,
+                                          const int out_tile_height,
+                                          const int out_tile_width,
+                                          std::vector<index_t> *output_shape,
+                                          std::vector<int> *in_pad_size,
+                                          std::vector<int> *out_pad_size) {
+  in_pad_size->resize(4);
+  out_pad_size->resize(4);
+  output_shape->resize(4);
+
+  const index_t in_height = input->dim(2);
+  const index_t in_width = input->dim(3);
+
+  const index_t stride_h = strides_[0];
+  const index_t stride_w = strides_[1];
+  const index_t dilation_h = dilations_[0];
+  const index_t dilation_w = dilations_[1];
+  const index_t filter_h = filter->dim(2);
+  const index_t filter_w = filter->dim(3);
+
+  std::vector<int> paddings(2);
+  if (paddings_.empty()) {
+    CalcNCHWPaddingAndOutputSize(input->shape().data(),
+                                 filter->shape().data(),
+                                 dilations_.data(),
+                                 strides_.data(),
+                                 padding_type_,
+                                 output_shape->data(),
+                                 paddings.data());
+  } else {
+    paddings = paddings_;
+    CalcNCHWOutputSize(input->shape().data(),
+                       filter->shape().data(),
+                       paddings_.data(),
+                       dilations_.data(),
+                       strides_.data(),
+                       RoundType::FLOOR,
+                       output_shape->data());
+  }
+  const index_t out_height = (*output_shape)[2];
+  const index_t out_width = (*output_shape)[3];
+  const index_t
+      padded_out_height = RoundUp<index_t>(out_height, out_tile_height);
+  const index_t padded_out_width = RoundUp<index_t>(out_width, out_tile_width);
+  const index_t padded_in_height =
+      std::max(in_height + paddings[0], (padded_out_height - 1) * stride_h
+          + (filter_h - 1) * dilation_h + 1);
+  const index_t padded_in_width =
+      std::max(in_width + paddings[1], (padded_out_width - 1) * stride_w
+          + (filter_w - 1) * dilation_w + 1);
+
+  (*in_pad_size)[0] = paddings[0] >> 1;
+  (*in_pad_size)[1] =
+      static_cast<int>(padded_in_height - in_height - (*in_pad_size)[0]);
+  (*in_pad_size)[2] = paddings[1] >> 1;
+  (*in_pad_size)[3] =
+      static_cast<int>(padded_in_width - in_width - (*in_pad_size)[2]);
+
+  (*out_pad_size)[0] = 0;
+  (*out_pad_size)[1] = static_cast<int>(padded_out_height - out_height);
+  (*out_pad_size)[2] = 0;
+  (*out_pad_size)[3] = static_cast<int>(padded_out_width - out_width);
+}
+
+MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
+                                            const Tensor *input,
+                                            const Tensor *filter,
+                                            Tensor *output,
+                                            const int out_tile_height,
+                                            const int out_tile_width,
+                                            std::unique_ptr<const Tensor>
+                                            *padded_input,
+                                            std::unique_ptr<Tensor>
+                                            *padded_output) {
+  std::vector<index_t> output_shape;
+  std::vector<int> in_pad_size;
+  std::vector<int> out_pad_size;
+  CalOutputShapeAndPadSize(input,
+                           filter,
+                           out_tile_height,
+                           out_tile_width,
+                           &output_shape,
+                           &in_pad_size,
+                           &out_pad_size);
+  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+  const index_t batch = input->dim(0);
+  const index_t in_channels = input->dim(1);
+  const index_t in_height = input->dim(2);
+  const index_t in_width = input->dim(3);
+  const index_t out_channels = output->dim(1);
+  const index_t out_height = output->dim(2);
+  const index_t out_width = output->dim(3);
+
+  const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1];
+  const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3];
+  const index_t
+      padded_out_height = out_height + out_pad_size[0] + out_pad_size[1];
+  const index_t
+      padded_out_width = out_width + out_pad_size[2] + out_pad_size[3];
+  const bool is_in_padded =
+      padded_in_height != in_height || padded_in_width != in_width;
+  const bool is_out_padded =
+      padded_out_height != out_height || padded_out_width != out_width;
+
+  auto scratch_buffer = context->device()->scratch_buffer();
+  const index_t padded_in_size =
+      MACE_EXTRA_BUFFER_PAD_SIZE + (is_in_padded ? PadAlignSize(
+          sizeof(float) * batch * in_channels * padded_in_height
+              * padded_in_width) : 0);
+  const index_t padded_out_size = is_out_padded ? PadAlignSize(
+      sizeof(float) * batch * out_channels * padded_out_height
+          * padded_out_width) : 0;
+
+  scratch_buffer->Rewind();
+  scratch_buffer->GrowSize(padded_in_size + padded_out_size);
+  if (is_in_padded) {
+    std::unique_ptr<Tensor>
+        padded_in =
+        make_unique<Tensor>(scratch_buffer->Scratch(padded_in_size),
+                            DataType::DT_FLOAT);
+    padded_in->Resize({batch, in_channels, padded_in_height, padded_in_width});
+    PadInput(*input, in_pad_size[0], in_pad_size[2], padded_in.get());
+    *padded_input = std::move(padded_in);
+  }
+  if (is_out_padded) {
+    std::unique_ptr<Tensor>
+        padded_out =
+        make_unique<Tensor>(scratch_buffer->Scratch(padded_out_size),
+                            DataType::DT_FLOAT);
+    padded_out->Resize({batch, out_channels, padded_out_height,
+                        padded_out_width});
+    *padded_output = std::move(padded_out);
+  }
+  return MaceStatus::MACE_SUCCESS;
+}
+
+void Conv2dBase::PadInput(const Tensor &src,
+                          const int pad_top,
+                          const int pad_left,
+                          mace::Tensor *dst) {
+  if (dst == &src) return;
+  const index_t batch = src.dim(0);
+  const index_t channels = src.dim(1);
+  const index_t height = src.dim(2);
+  const index_t width = src.dim(3);
+  const index_t padded_height = dst->dim(2);
+  const index_t padded_width = dst->dim(3);
+  const int pad_bottom = static_cast<int>(padded_height - height - pad_top);
+  const int pad_right = static_cast<int>(padded_width - width - pad_left);
+  auto in_data = src.data<float>();
+  auto padded_in_data = dst->mutable_data<float>();
+
+  const index_t img_size = height * width;
+  const index_t padded_img_size = padded_height * padded_width;
+
+#pragma omp parallel for collapse(2) schedule(runtime)
+  for (index_t b = 0; b < batch; ++b) {
+    for (index_t c = 0; c < channels; ++c) {
+      const index_t bc = b * channels + c;
+      const float *in_base = in_data + bc * img_size;
+      float *padded_in_base = padded_in_data + bc * padded_img_size;
+
+      memset(padded_in_base, 0, sizeof(float) * pad_top * padded_width);
+      padded_in_base += pad_top * padded_width;
+      for (index_t h = 0; h < height; ++h) {
+        memset(padded_in_base,
+               0,
+               sizeof(float) * pad_left);
+        memcpy(padded_in_base + pad_left,
+               in_base,
+               sizeof(float) * width);
+        memset(padded_in_base + pad_left + width,
+               0,
+               sizeof(float) * pad_right);
+        in_base += width;
+        padded_in_base += padded_width;
+      }
+      memset(padded_in_base, 0, sizeof(float) * pad_bottom * padded_width);
+    }
+  }
+}
+
+void Conv2dBase::UnPadOutput(const mace::Tensor &src, mace::Tensor *dst) {
+  if (dst == &src) return;
+  const index_t batch = dst->dim(0);
+  const index_t channels = dst->dim(1);
+  const index_t height = dst->dim(2);
+  const index_t width = dst->dim(3);
+  const index_t padded_height = src.dim(2);
+  const index_t padded_width = src.dim(3);
+
+  auto padded_out_data = src.data<float>();
+  auto out_data = dst->mutable_data<float>();
+
+  const index_t img_size = height * width;
+  const index_t padded_img_size = padded_height * padded_width;
+
+#pragma omp parallel for collapse(2) schedule(runtime)
+  for (index_t b = 0; b < batch; ++b) {
+    for (index_t c = 0; c < channels; ++c) {
+      const index_t bc = (b * channels + c);
+      float *out_base = out_data + bc * img_size;
+      const float *padded_out_base = padded_out_data + bc * padded_img_size;
+
+      for (index_t h = 0; h < height; ++h) {
+        memcpy(out_base,
+               padded_out_base,
+               sizeof(float) * width);
+        out_base += width;
+        padded_out_base += padded_width;
+      }  // h
+    }  // c
+  }  // b
+}
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
diff --git a/mace/ops/arm/fp32/conv_2d.h b/mace/ops/arm/fp32/conv_2d.h
index 7d77cf14..832f6f2f 100644
--- a/mace/ops/arm/fp32/conv_2d.h
+++ b/mace/ops/arm/fp32/conv_2d.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,10 +15,14 @@
 #ifndef MACE_OPS_ARM_FP32_CONV_2D_H_
 #define MACE_OPS_ARM_FP32_CONV_2D_H_
 
+#include <memory>
+#include <vector>
+
 #include "mace/public/mace.h"
 #include "mace/core/tensor.h"
 #include "mace/core/op_context.h"
 #include "mace/ops/arm/fp32/gemm.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
 
 namespace mace {
 namespace ops {
@@ -27,13 +31,51 @@ namespace fp32 {
 
 class Conv2dBase {
  public:
-  Conv2dBase() = default;
+  Conv2dBase(const std::vector<int> strides,
+             const std::vector<int> dilations,
+             const std::vector<int> paddings,
+             const Padding padding_type)
+      : strides_(strides),
+        dilations_(dilations),
+        paddings_(paddings),
+        padding_type_(padding_type) {}
+
   virtual ~Conv2dBase() = default;
+
   virtual MaceStatus Compute(
       const OpContext *context,
       const Tensor *input,
       const Tensor *filter,
       Tensor *output) = 0;
+
+ protected:
+  void CalOutputShapeAndPadSize(const Tensor *input,
+                                const Tensor *filter,
+                                const int out_tile_height,
+                                const int out_tile_width,
+                                std::vector<index_t> *output_shape,
+                                std::vector<int> *in_pad_size,
+                                std::vector<int> *out_pad_size);
+
+  MaceStatus ResizeOutAndPadInOut(const OpContext *context,
+                                  const Tensor *input,
+                                  const Tensor *filter,
+                                  Tensor *output,
+                                  const int out_tile_height,
+                                  const int out_tile_width,
+                                  std::unique_ptr<const Tensor> *padded_input,
+                                  std::unique_ptr<Tensor> *padded_output);
+
+  void PadInput(const Tensor &src,
+                const int pad_top,
+                const int pad_left,
+                Tensor *dst);
+  void UnPadOutput(const Tensor &src, Tensor *dst);
+
+  const std::vector<int> strides_;
+  const std::vector<int> dilations_;
+  const std::vector<int> paddings_;
+  const Padding padding_type_;
 };
 
 }  // namespace fp32
diff --git a/mace/ops/arm/fp32/conv_2d_1x1.cc b/mace/ops/arm/fp32/conv_2d_1x1.cc
index b34e19aa..d5e03652 100644
--- a/mace/ops/arm/fp32/conv_2d_1x1.cc
+++ b/mace/ops/arm/fp32/conv_2d_1x1.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-
 #include "mace/ops/arm/fp32/conv_2d_1x1.h"
 
 namespace mace {
@@ -25,20 +24,68 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,
                                const Tensor *filter,
                                Tensor *output) {
   index_t batch = input->dim(0);
-  index_t height = input->dim(2);
-  index_t width = input->dim(3);
+  index_t in_height = input->dim(2);
+  index_t in_width = input->dim(3);
   index_t in_channels = input->dim(1);
-  index_t out_channels = filter->dim(0);
-  MACE_RETURN_IF_ERROR(output->Resize({batch, out_channels, height, width}));
-  context->device()->scratch_buffer()->Rewind();
+
+  std::vector<index_t> output_shape;
+  std::vector<int> in_pad_size;
+  std::vector<int> out_pad_size;
+  CalOutputShapeAndPadSize(input,
+                           filter,
+                           1,
+                           1,
+                           &output_shape,
+                           &in_pad_size,
+                           &out_pad_size);
+  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+  const index_t out_channels = output_shape[1];
+  const index_t out_height = output_shape[2];
+  const index_t out_width = output_shape[3];
+  const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1];
+  const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3];
+
+  // pad input and transform input
+  const bool is_in_padded =
+      in_height != padded_in_height || in_width != padded_in_width;
+  auto scratch_buffer = context->device()->scratch_buffer();
+  const index_t padded_in_size = is_in_padded ? PadAlignSize(
+      sizeof(float) * batch * in_channels * padded_in_height
+          * padded_in_width) : 0;
+  const index_t pack_filter_size =
+      PadAlignSize(sizeof(float) * out_channels * in_channels);
+  const index_t pack_input_size =
+      PadAlignSize(
+          sizeof(float) * in_channels * padded_in_height * padded_in_width);
+  const index_t pack_output_size =
+      PadAlignSize(
+          sizeof(float) * out_channels * padded_in_height * padded_in_width);
+
+  const index_t gemm_pack_size =
+      pack_filter_size + pack_input_size + pack_output_size;
+
+  scratch_buffer->Rewind();
+  scratch_buffer->GrowSize(padded_in_size + gemm_pack_size);
+
+  const Tensor *padded_in = input;
+  Tensor tmp_padded_in
+      (scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT);
+  if (is_in_padded) {
+    tmp_padded_in.Resize({batch, in_channels, padded_in_height,
+                          padded_in_width});
+    PadInput(*input, in_pad_size[0], in_pad_size[2], &tmp_padded_in);
+    padded_in = &tmp_padded_in;
+  }
+
   return gemm_.Compute(context,
                        filter,
-                       input,
+                       padded_in,
                        batch,
                        out_channels,
                        in_channels,
                        in_channels,
-                       height * width,
+                       out_height * out_width,
                        false,
                        false,
                        false,
diff --git a/mace/ops/arm/fp32/conv_2d_1x1.h b/mace/ops/arm/fp32/conv_2d_1x1.h
index fd2077ec..68b792fd 100644
--- a/mace/ops/arm/fp32/conv_2d_1x1.h
+++ b/mace/ops/arm/fp32/conv_2d_1x1.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
 #ifndef MACE_OPS_ARM_FP32_CONV_2D_1X1_H_
 #define MACE_OPS_ARM_FP32_CONV_2D_1X1_H_
 
+#include <vector>
 #include "mace/public/mace.h"
 #include "mace/core/tensor.h"
 #include "mace/core/op_context.h"
@@ -28,7 +29,8 @@ namespace fp32 {
 
 class Conv2dK1x1 : public Conv2dBase {
  public:
-  Conv2dK1x1() : gemm_(true) {}
+  Conv2dK1x1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
   virtual ~Conv2dK1x1() {}
 
   MaceStatus Compute(
diff --git a/mace/ops/arm/fp32/conv_2d_1xn.cc b/mace/ops/arm/fp32/conv_2d_1xn.cc
new file mode 100644
index 00000000..1ff99d80
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_2d_1xn.cc
@@ -0,0 +1,821 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arm_neon.h>
+#include <memory>
+#include "mace/ops/arm/fp32/conv_2d_1xn.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+MaceStatus Conv2dK1x7S1::Compute(const OpContext *context,
+                                 const Tensor *input,
+                                 const Tensor *filter,
+                                 Tensor *output) {
+  std::unique_ptr<const Tensor> padded_input;
+  std::unique_ptr<Tensor> padded_output;
+
+  ResizeOutAndPadInOut(context,
+                       input,
+                       filter,
+                       output,
+                       1,
+                       4,
+                       &padded_input,
+                       &padded_output);
+  const Tensor *in_tensor = input;
+  if (padded_input.get() != nullptr) {
+    in_tensor = padded_input.get();
+  }
+  Tensor *out_tensor = output;
+  if (padded_output.get() != nullptr) {
+    out_tensor = padded_output.get();
+  }
+  out_tensor->Clear();
+
+  Tensor::MappingGuard in_guard(input);
+  Tensor::MappingGuard filter_guard(filter);
+  Tensor::MappingGuard out_guard(output);
+  auto filter_data = filter->data<float>();
+  auto input_data = in_tensor->data<float>();
+  auto output_data = out_tensor->mutable_data<float>();
+
+  auto in_shape = in_tensor->shape();
+  auto out_shape = out_tensor->shape();
+
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
+
+#pragma omp parallel for collapse(2) schedule(runtime)
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; m += 4) {
+      const index_t out_channels = out_shape[1];
+      const index_t out_height = out_shape[2];
+      const index_t out_width = out_shape[3];
+      const index_t in_channels = in_shape[1];
+      const index_t in_width = in_shape[3];
+      if (m + 3 < out_channels) {
+        float *out_ptr0_base =
+            output_data + b * out_batch_size + m * out_image_size;
+        float *out_ptr1_base =
+            output_data + b * out_batch_size + (m + 1) * out_image_size;
+        float *out_ptr2_base =
+            output_data + b * out_batch_size + (m + 2) * out_image_size;
+        float *out_ptr3_base =
+            output_data + b * out_batch_size + (m + 3) * out_image_size;
+        for (index_t c = 0; c < in_channels; ++c) {
+          const float *in_ptr_base =
+              input_data + b * in_batch_size + c * in_image_size;
+          const float *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7;
+          const float
+              *filter_ptr1
= filter_data + (m + 1) * in_channels * 7 + c * 7; + const float + *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; + const float + *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; + /* load filter (4 outch x 1 height x 4 width) */ + float32x4_t vf00, vf01; + float32x4_t vf10, vf11; + float32x4_t vf20, vf21; + float32x4_t vf30, vf31; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + vf10 = vld1q_f32(filter_ptr1); + vf11 = vld1q_f32(filter_ptr1 + 3); + vf20 = vld1q_f32(filter_ptr2); + vf21 = vld1q_f32(filter_ptr2 + 3); + vf30 = vld1q_f32(filter_ptr3); + vf31 = vld1q_f32(filter_ptr3 + 3); + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (4 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0, vo1, vo2, vo3; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + vo1 = vld1q_f32(out_ptr1_base + out_offset); + vo2 = vld1q_f32(out_ptr2_base + out_offset); + vo3 = vld1q_f32(out_ptr3_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; + // input offset + index_t in_offset = h * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + /* outch 0 */ + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); + /* outch 1 */ + vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); + vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); + vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); + vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); + vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); + vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); + vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); + /* outch 2 */ + vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); + vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); + vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); + vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); + vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); + vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); + vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); + /* outch 3 */ + vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); + vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); + vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); + vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); + vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); + vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); + vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); +#else + /* outch 0 */ + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); + /* outch 1 */ + vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); + vo1 = 
vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); + vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); + vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); + /* outch 2 */ + vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); + vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); + vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); + /* outch 3 */ + vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); + vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); + vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); +#endif + + vst1q_f32(out_ptr0_base + out_offset, vo0); + vst1q_f32(out_ptr1_base + out_offset, vo1); + vst1q_f32(out_ptr2_base + out_offset, vo2); + vst1q_f32(out_ptr3_base + out_offset, vo3); + } // w + } // h + } // c + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; + /* load filter (1 outch x 1 height x 4 width) */ + float32x4_t vf00, vf01; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; + // input offset + index_t in_offset = h * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); +#else + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); +#endif + + vst1q_f32(out_ptr0_base + out_offset, vo0); + } // w + } // h + } // c + } + } // if + } // m + } // b + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + 
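+  // Same structure as Conv2dK1x7S1 with height and width swapped: the
+  // output is tiled 4 rows high by 1 column wide (the 4, 1 passed to
+  // ResizeOutAndPadInOut below), four consecutive output rows are
+  // gathered element-wise into one float32x4_t, and the seven filter
+  // taps then slide vertically across ten input rows via vextq_f32.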
std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 4, + 1, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + if (m + 3 < out_channels) { + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; + const float + *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; + const float + *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; + const float + *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; + /* load filter (4 outch x 4 height x 1 width) */ + float32x4_t vf00, vf01; + float32x4_t vf10, vf11; + float32x4_t vf20, vf21; + float32x4_t vf30, vf31; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + vf10 = vld1q_f32(filter_ptr1); + vf11 = vld1q_f32(filter_ptr1 + 3); + vf20 = vld1q_f32(filter_ptr2); + vf21 = vld1q_f32(filter_ptr2 + 3); + vf30 = vld1q_f32(filter_ptr3); + vf31 = vld1q_f32(filter_ptr3 + 3); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t w = 0; w < out_width; ++w) { + // load output + index_t out_offset = h * out_width + w; + // output (4 outch x 4 height x 1 width): vo_outch_height + float32x4_t vo0 = {out_ptr0_base[out_offset], + out_ptr0_base[out_offset + out_width], + out_ptr0_base[out_offset + 2 * out_width], + out_ptr0_base[out_offset + 3 * out_width]}; + float32x4_t vo1 = {out_ptr1_base[out_offset], + out_ptr1_base[out_offset + out_width], + out_ptr1_base[out_offset + 2 * out_width], + out_ptr1_base[out_offset + 3 * out_width]}; + float32x4_t vo2 = {out_ptr2_base[out_offset], + out_ptr2_base[out_offset + out_width], + out_ptr2_base[out_offset + 2 * out_width], + out_ptr2_base[out_offset + 3 * out_width]}; + float32x4_t vo3 = {out_ptr3_base[out_offset], + out_ptr3_base[out_offset + out_width], + out_ptr3_base[out_offset + 2 * out_width], + out_ptr3_base[out_offset + 3 * out_width]}; + + // input offset + index_t 
in_offset = h * in_width + w; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + /* outch 0 */ + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); + /* outch 1 */ + vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); + vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); + vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); + vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); + vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); + vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); + vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); + /* outch 2 */ + vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); + vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); + vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); + vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); + vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); + vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); + vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); + /* outch 3 */ + vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); + vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); + vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); + vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); + vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); + vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); + vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); +#else + /* outch 0 */ + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); + /* outch 1 */ + vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); + vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); + vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); + /* outch 2 */ + vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); + vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); + vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); + /* outch 3 */ + vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 
1); + vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); + vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); + vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); +#endif + + out_ptr0_base[out_offset] = vo0[0]; + out_ptr0_base[out_offset + out_width] = vo0[1]; + out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; + out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; + out_ptr1_base[out_offset] = vo1[0]; + out_ptr1_base[out_offset + out_width] = vo1[1]; + out_ptr1_base[out_offset + 2 * out_width] = vo1[2]; + out_ptr1_base[out_offset + 3 * out_width] = vo1[3]; + out_ptr2_base[out_offset] = vo2[0]; + out_ptr2_base[out_offset + out_width] = vo2[1]; + out_ptr2_base[out_offset + 2 * out_width] = vo2[2]; + out_ptr2_base[out_offset + 3 * out_width] = vo2[3]; + out_ptr3_base[out_offset] = vo3[0]; + out_ptr3_base[out_offset + out_width] = vo3[1]; + out_ptr3_base[out_offset + 2 * out_width] = vo3[2]; + out_ptr3_base[out_offset + 3 * out_width] = vo3[3]; + } // w + } // h + } // c + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf00, vf01; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t w = 0; w < out_width; ++w) { + // load output + index_t out_offset = h * out_width + w; + // output (1 outch x 4 height x 1 width): vo_outch_height + float32x4_t vo0 = {out_ptr0_base[out_offset], + out_ptr0_base[out_offset + out_width], + out_ptr0_base[out_offset + 2 * out_width], + out_ptr0_base[out_offset + 3 * out_width]}; + + // input offset + index_t in_offset = h * in_width + w; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width], + in_ptr_base[in_offset + 10 * in_width], + in_ptr_base[in_offset + 11 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); +#else + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); +#endif + + out_ptr0_base[out_offset] = vo0[0]; + out_ptr0_base[out_offset + 
out_width] = vo0[1]; + out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; + out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; + } // w + } // h + } // c + } + } // if + } // m + } // b + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; +} + + +// ==== + +MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + const index_t tile_height = + out_shape[1] < 4 ? RoundUpDiv4(out_shape[2]) : out_shape[2]; + +#pragma omp parallel for collapse(3) schedule(runtime) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { + for (index_t h = 0; h < out_shape[2]; h += tile_height) { + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + float *out_ptr_base = + output_data + b * out_batch_size + m * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter_data + m * in_channels * 15 + c * 15; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf0, vf1, vf2, vf3; + vf0 = vld1q_f32(filter_ptr); + vf1 = vld1q_f32(filter_ptr + 4); + vf2 = vld1q_f32(filter_ptr + 8); + vf3 = vld1q_f32(filter_ptr + 11); + + for (index_t ht = 0; ht < tile_height && h + ht < out_height; ++ht) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo; + // load output + index_t out_offset = (h + ht) * out_width + w; + vo = vld1q_f32(out_ptr_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9, + vi10, vi11, vi12, vi13, vi14, vi16; + // input offset + index_t in_offset = (h + ht) * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi12 = vld1q_f32(in_ptr_base + in_offset + 12); + vi16 = vld1q_f32(in_ptr_base + in_offset + 16); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + vi7 = vextq_f32(vi4, vi8, 3); + vi9 = vextq_f32(vi8, vi12, 1); + vi10 = vextq_f32(vi8, vi12, 2); + vi11 = vextq_f32(vi8, vi12, 3); + vi13 = vextq_f32(vi12, vi16, 1); + vi14 = vextq_f32(vi12, vi16, 2); + + vo = vmlaq_lane_f32(vo, vi0, 
vget_low_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); + vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); + vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); + + vst1q_f32(out_ptr_base + out_offset, vo); + } // w + } // ht + } // c + } // h + } // m + } // b + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 4, + 1, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + const index_t tile_width = + out_shape[1] < 4 ? 
RoundUpDiv4(out_shape[3]) : out_shape[3]; + +#pragma omp parallel for collapse(3) schedule(runtime) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { + for (index_t w = 0; w < out_shape[3]; w += tile_width) { + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + float *out_ptr_base = + output_data + b * out_batch_size + m * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter_data + m * in_channels * 15 + c * 15; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf0, vf1, vf2, vf3; + vf0 = vld1q_f32(filter_ptr); + vf1 = vld1q_f32(filter_ptr + 4); + vf2 = vld1q_f32(filter_ptr + 8); + vf3 = vld1q_f32(filter_ptr + 11); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t wt = 0; wt < tile_width && w + wt < out_width; ++wt) { + // load output + index_t out_offset = h * out_width + w + wt; + // output (1 outch x 4 height x 1 width): vo_outch_height + float32x4_t vo = {out_ptr_base[out_offset], + out_ptr_base[out_offset + out_width], + out_ptr_base[out_offset + 2 * out_width], + out_ptr_base[out_offset + 3 * out_width]}; + + // input offset + index_t in_offset = h * in_width + w + wt; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width], + in_ptr_base[in_offset + 10 * in_width], + in_ptr_base[in_offset + 11 * in_width]}; + float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width], + in_ptr_base[in_offset + 13 * in_width], + in_ptr_base[in_offset + 14 * in_width], + in_ptr_base[in_offset + 15 * in_width]}; + float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width], + in_ptr_base[in_offset + 17 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + float32x4_t vi7 = vextq_f32(vi4, vi8, 3); + float32x4_t vi9 = vextq_f32(vi8, vi12, 1); + float32x4_t vi10 = vextq_f32(vi8, vi12, 2); + float32x4_t vi11 = vextq_f32(vi8, vi12, 3); + float32x4_t vi13 = vextq_f32(vi12, vi16, 1); + float32x4_t vi14 = vextq_f32(vi12, vi16, 2); + + vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); + vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); + vo = vmlaq_lane_f32(vo, 
vi14, vget_high_f32(vf3), 1); + + out_ptr_base[out_offset] = vo[0]; + out_ptr_base[out_offset + out_width] = vo[1]; + out_ptr_base[out_offset + 2 * out_width] = vo[2]; + out_ptr_base[out_offset + 3 * out_width] = vo[3]; + } // wt + } // h + } // c + } // w + } // m + } // b + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_1xn.h b/mace/ops/arm/fp32/conv_2d_1xn.h new file mode 100644 index 00000000..a4a5e899 --- /dev/null +++ b/mace/ops/arm/fp32/conv_2d_1xn.h @@ -0,0 +1,86 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_ARM_FP32_CONV_2D_1XN_H_ +#define MACE_OPS_ARM_FP32_CONV_2D_1XN_H_ + +#include +#include "mace/public/mace.h" +#include "mace/core/tensor.h" +#include "mace/core/op_context.h" +#include "mace/ops/arm/fp32/conv_2d.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +class Conv2dK1x7S1 : public Conv2dBase { + public: + Conv2dK1x7S1(const std::vector paddings, const Padding padding_type) + : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + virtual ~Conv2dK1x7S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); +}; + +class Conv2dK7x1S1 : public Conv2dBase { + public: + Conv2dK7x1S1(const std::vector paddings, const Padding padding_type) + : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + virtual ~Conv2dK7x1S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); +}; + +class Conv2dK1x15S1 : public Conv2dBase { + public: + Conv2dK1x15S1(const std::vector paddings, const Padding padding_type) + : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + virtual ~Conv2dK1x15S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); +}; + +class Conv2dK15x1S1 : public Conv2dBase { + public: + Conv2dK15x1S1(const std::vector paddings, const Padding padding_type) + : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + virtual ~Conv2dK15x1S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); +}; + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_FP32_CONV_2D_1XN_H_ diff --git a/mace/ops/arm/conv_2d_neon_3x3.cc b/mace/ops/arm/fp32/conv_2d_3x3.cc similarity index 85% rename from mace/ops/arm/conv_2d_neon_3x3.cc rename to mace/ops/arm/fp32/conv_2d_3x3.cc index 3555b4a5..a8ce5fa6 100644 --- a/mace/ops/arm/conv_2d_neon_3x3.cc +++ b/mace/ops/arm/fp32/conv_2d_3x3.cc @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. 
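The 1xN kernels above all lean on one sliding-window idiom: three 4-lane loads provide every shifted window a 7-tap horizontal filter needs, synthesized with vextq_f32, so no input lane is reloaded from memory. A stripped-down sketch of one 4-wide output computation; the function name is illustrative, and the real kernels additionally tile four output channels and switch between vmlaq_lane_f32 and vfmaq_laneq_f32 for armv7 versus aarch64:

#include <arm_neon.h>

// Computes 4 adjacent outputs of a 7-tap horizontal filter.
// k7 points at taps k0..k6; the second load overlaps at k3 (lane 0 unused),
// mirroring the vld1q_f32(filter_ptr + 3) pattern in the kernels above.
void Conv1x7FourOutputs(const float *in, const float *k7, float *out) {
  float32x4_t vi0 = vld1q_f32(in);        // x0..x3
  float32x4_t vi4 = vld1q_f32(in + 4);    // x4..x7
  float32x4_t vi8 = vld1q_f32(in + 8);    // x8..x11
  float32x4_t vf0 = vld1q_f32(k7);        // k0..k3
  float32x4_t vf1 = vld1q_f32(k7 + 3);    // k3..k6
  float32x4_t vo = vmulq_lane_f32(vi0, vget_low_f32(vf0), 0);
  vo = vmlaq_lane_f32(vo, vextq_f32(vi0, vi4, 1), vget_low_f32(vf0), 1);
  vo = vmlaq_lane_f32(vo, vextq_f32(vi0, vi4, 2), vget_high_f32(vf0), 0);
  vo = vmlaq_lane_f32(vo, vextq_f32(vi0, vi4, 3), vget_high_f32(vf0), 1);
  vo = vmlaq_lane_f32(vo, vi4,                    vget_low_f32(vf1), 1);
  vo = vmlaq_lane_f32(vo, vextq_f32(vi4, vi8, 1), vget_high_f32(vf1), 0);
  vo = vmlaq_lane_f32(vo, vextq_f32(vi4, vi8, 2), vget_high_f32(vf1), 1);
  vst1q_f32(out, vo);
}

The Kx1 variants apply the same vextq_f32 shifts vertically, after first gathering row-strided elements into vectors lane by lane.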
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,22 +12,49 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(MACE_ENABLE_NEON) #include -#endif - -#include "mace/utils/macros.h" -#include "mace/ops/arm/conv_2d_neon.h" +#include +#include "mace/ops/arm/fp32/conv_2d_3x3.h" namespace mace { namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 2, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); -// Ho = 2, Wo = 4, Co = 2 -void Conv2dNeonK3x3S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -42,26 +69,26 @@ void Conv2dNeonK3x3S1(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 1 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 1) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr0 = input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 9 + c * 9; + const float + *in_ptr0 = input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = filter_data + m * in_channels * 9 + c * 9; -#if defined(MACE_ENABLE_NEON) float *out_ptr1 = out_ptr1_base; const float *in_ptr1 = - input + b * in_batch_size + c * in_image_size + 1 * in_width; + input_data + b * in_batch_size + c * in_image_size + 1 * in_width; const float *in_ptr2 = - input + b * in_batch_size + c * in_image_size + 2 * in_width; + input_data + b * in_batch_size + c * in_image_size + 2 * in_width; const float *in_ptr3 = - input + b * in_batch_size + c * in_image_size + 3 * in_width; - const float *filter_ptr1 = filter + (m + 1) * in_channels * 9 + c * 9; -#endif -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) + input_data + b * in_batch_size + c * in_image_size + 3 * in_width; + const float + *filter_ptr1 = filter_data + (m + 1) * in_channels * 9 + c * 9; + +#if defined(__aarch64__) float *out_ptr0 = out_ptr0_base; // load filter (2 outch x 3 height x 3 width): vf_outch_height @@ -179,7 +206,7 @@ void Conv2dNeonK3x3S1(const float *input, out_ptr0 += out_width; out_ptr1 += out_width; 
} // h -#elif defined(MACE_ENABLE_NEON) // arm v7 +#else // arm v7 float *out_ptr0 = out_ptr0_base; // load filter (2 outch x 3 height x 3 width): vf_outch_height @@ -301,32 +328,28 @@ void Conv2dNeonK3x3S1(const float *input, out_ptr0 += out_width; out_ptr1 += out_width; } // h -#else - for (index_t oc = 0; oc < 2; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0 + oc * in_channels * 9, - in_width, 3, 3, out_height, out_width, - out_ptr0_base + oc * out_image_size, 1); - } #endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr0 = - input + b * in_batch_size + c * in_image_size; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; const float *in_ptr1 = - input + b * in_batch_size + c * in_image_size + 1 * in_width; + input_data + b * in_batch_size + c * in_image_size + + 1 * in_width; const float *in_ptr2 = - input + b * in_batch_size + c * in_image_size + 2 * in_width; + input_data + b * in_batch_size + c * in_image_size + + 2 * in_width; const float *in_ptr3 = - input + b * in_batch_size + c * in_image_size + 3 * in_width; -#endif - const float *filter_ptr0 = filter + mm * in_channels * 9 + c * 9; + input_data + b * in_batch_size + c * in_image_size + + 3 * in_width; + const float + *filter_ptr0 = filter_data + mm * in_channels * 9 + c * 9; -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) +#if defined(__aarch64__) float *out_ptr0 = out_ptr0_base; // load filter (1 outch x 3 height x 3 width): vf_outch_height @@ -409,7 +432,7 @@ void Conv2dNeonK3x3S1(const float *input, out_ptr0 += out_width; } // h -#elif defined(MACE_ENABLE_NEON) // arm v7 +#else // arm v7 float *out_ptr0 = out_ptr0_base; // load filter (1 outch x 3 height x 3 width): vf_outch_height @@ -494,22 +517,52 @@ void Conv2dNeonK3x3S1(const float *input, out_ptr0 += out_width; } // h -#else - Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0, in_width, 3, 3, out_height, - out_width, out_ptr0_base, 1); #endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } -void Conv2dNeonK3x3S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -523,11 +576,12 @@ void Conv2dNeonK3x3S2(const float *input, const index_t in_width = 
in_shape[3]; const index_t out_height = out_shape[2]; const index_t out_width = out_shape[3]; - const float *in_base = input + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter + m * in_channels * 9 + c * 9; - float *out_base = output + b * out_batch_size + m * out_image_size; + const float + *in_base = input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter_data + m * in_channels * 9 + c * 9; + float *out_base = output_data + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) +#if defined(__aarch64__) // load filter (1 outch x 3 height x 3 width): vf_outch_height float32x4_t vf00, vf01, vf02; vf00 = vld1q_f32(filter_ptr); @@ -587,7 +641,7 @@ void Conv2dNeonK3x3S2(const float *input, vst1q_f32(out_base + out_offset, vo); } // w } // h -#elif defined(MACE_ENABLE_NEON) // arm v7 +#else // arm v7 // load filter (1 outch x 3 height x 3 width): vf_outch_height float32x2_t vf01, vf23, vf45, vf67, vf78; vf01 = vld1_f32(filter_ptr); @@ -649,14 +703,16 @@ void Conv2dNeonK3x3S2(const float *input, vst1q_f32(out_base + out_offset, vo); } // w } // h -#else - Conv2dCPUKHxKWCalc(in_base, filter_ptr, in_width, 3, 3, out_height, - out_width, out_base, 2); #endif } // c } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } +} // namespace fp32 +} // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_3x3.h b/mace/ops/arm/fp32/conv_2d_3x3.h new file mode 100644 index 00000000..66d47801 --- /dev/null +++ b/mace/ops/arm/fp32/conv_2d_3x3.h @@ -0,0 +1,60 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
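Why Conv2dK3x3S1 above streams four input row pointers (in_ptr0..in_ptr3): adjacent output rows of a 3x3 stride-1 filter share two of their three input rows, so a two-row output tile touches four input rows instead of six. The same idea in scalar form, illustrative only:

// Output rows 0 and 1 reuse input rows 1 and 2; only four rows are read.
void Conv3x3S1TwoRows(const float *in, int in_w,
                      const float *k,          // 9 taps, row-major
                      float *out0, float *out1, int out_w) {
  for (int w = 0; w < out_w; ++w) {
    float s0 = 0.f, s1 = 0.f;
    for (int kh = 0; kh < 3; ++kh) {
      for (int kw = 0; kw < 3; ++kw) {
        float tap = k[kh * 3 + kw];
        s0 += tap * in[kh * in_w + w + kw];        // rows 0, 1, 2
        s1 += tap * in[(kh + 1) * in_w + w + kw];  // rows 1, 2, 3 (shared)
      }
    }
    out0[w] = s0;
    out1[w] = s1;
  }
}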
+ +#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_H_ +#define MACE_OPS_ARM_FP32_CONV_2D_3X3_H_ + +#include +#include "mace/public/mace.h" +#include "mace/core/tensor.h" +#include "mace/core/op_context.h" +#include "mace/ops/arm/fp32/conv_2d.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +class Conv2dK3x3S1 : public Conv2dBase { + public: + Conv2dK3x3S1(const std::vector paddings, const Padding padding_type) + : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + virtual ~Conv2dK3x3S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); +}; + +class Conv2dK3x3S2 : public Conv2dBase { + public: + Conv2dK3x3S2(const std::vector paddings, const Padding padding_type) + : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {} + virtual ~Conv2dK3x3S2() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); +}; + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_FP32_CONV_2D_3X3_H_ diff --git a/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc index 84f12125..b894a60a 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc +++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc @@ -34,13 +34,6 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, const index_t in_width = input->dim(3); const index_t out_channels = filter->dim(0); - index_t padded_in_height = in_height + pad_top_ + pad_bottom_; - index_t padded_in_width = in_width + pad_left_ + pad_right_; - - index_t out_height = padded_in_height - 2; - index_t out_width = padded_in_width - 2; - output->Resize({batch, out_channels, out_height, out_width}); - // When size of input feature map is bigger than 16x16, // set winograd out tile size to 6 to get higher performance. 
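 // For scale: with output tile t, each input tile is (t + 2) x (t + 2)
 // (3x3 filter, stride 1), so the transformed input and output buffers
 // each hold roughly batch * channels * (padded_out_h / t) *
 // (padded_out_w / t) * (t + 2)^2 floats -- the quantity the
 // transformed_in_size / transformed_out_size computations below
 // account for.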
index_t out_tile_size = 2; @@ -48,10 +41,35 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, out_tile_size = 6; } - const index_t padded_out_height = RoundUp(out_height, out_tile_size); - const index_t padded_out_width = RoundUp(out_width, out_tile_size); - padded_in_height = std::max(padded_in_height, padded_out_height + 2); - padded_in_width = std::max(padded_in_width, padded_out_width + 2); + std::vector output_shape; + std::vector in_pad_size; + std::vector out_pad_size; + CalOutputShapeAndPadSize(input, + filter, + out_tile_size, + out_tile_size, + &output_shape, + &in_pad_size, + &out_pad_size); + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + + const index_t out_height = output_shape[2]; + const index_t out_width = output_shape[3]; + const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1]; + const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3]; + const index_t + padded_out_height = out_height + out_pad_size[0] + out_pad_size[1]; + const index_t + padded_out_width = out_width + out_pad_size[2] + out_pad_size[3]; + const int pad_top = in_pad_size[0]; + const int pad_left = in_pad_size[2]; + + bool is_in_padded = + padded_in_height != in_height || padded_in_width != in_width; bool is_out_padded = padded_out_height != out_height || padded_out_width != out_width; @@ -63,8 +81,9 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, // pad input and transform input auto scratch_buffer = context->device()->scratch_buffer(); - const index_t padded_in_size = PadAlignSize( - sizeof(float) * batch * in_channels * padded_in_height * padded_in_width); + const index_t padded_in_size = is_in_padded ? PadAlignSize( + sizeof(float) * batch * in_channels * padded_in_height + * padded_in_width) : 0; const index_t padded_out_size = is_out_padded ? 
PadAlignSize( sizeof(float) * batch * out_channels * padded_out_height * padded_out_width) : 0; @@ -81,8 +100,18 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, scratch_buffer->GrowSize( padded_in_size + padded_out_size + transformed_in_size + transformed_out_size + gemm_pack_size); - Tensor padded_in(scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT); - padded_in.Resize({batch, in_channels, padded_in_height, padded_in_width}); + + const Tensor *padded_in = input; + Tensor tmp_padded_in + (scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT); + if (is_in_padded) { + tmp_padded_in.Resize({batch, in_channels, padded_in_height, + padded_in_width}); + Tensor::MappingGuard guard(&tmp_padded_in); + PadInput(*input, pad_top, pad_left, &tmp_padded_in); + padded_in = &tmp_padded_in; + } + Tensor *padded_out = output; Tensor tmp_padded_out (scratch_buffer->Scratch(padded_out_size), DataType::DT_FLOAT); @@ -94,22 +123,10 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, auto transformed_in = scratch_buffer->Scratch(transformed_in_size); auto transformed_out = scratch_buffer->Scratch(transformed_out_size); - - auto padded_in_data = padded_in.data(); + auto padded_in_data = padded_in->data(); auto padded_out_data = padded_out->mutable_data(); auto transformed_in_data = transformed_in.mutable_data(); auto transformed_out_data = transformed_out.mutable_data(); - - const index_t padded_bottom = padded_in_height - in_height - pad_top_; - const index_t padded_right = padded_in_width - in_width - pad_left_; - ConstructNCHWInputWithSpecificPadding(input, - pad_top_, - padded_bottom, - pad_left_, - padded_right, - &padded_in); - - Tensor::MappingGuard filter_guard(filter); auto filter_data = filter->data(); if (!filter->is_weight() || out_tile_size != out_tile_size_) { @@ -215,47 +232,11 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, default:MACE_NOT_IMPLEMENTED; } - if (is_out_padded) { - UnPackOutput(*padded_out, output); - } + UnPadOutput(*padded_out, output); return MaceStatus::MACE_SUCCESS; } -void Conv2dK3x3Winograd::UnPackOutput(const Tensor &src, Tensor *dst) { - const index_t batch = dst->dim(0); - const index_t channels = dst->dim(1); - const index_t height = dst->dim(2); - const index_t width = dst->dim(3); - const index_t padded_height = src.dim(2); - const index_t padded_width = src.dim(3); - - if (height == padded_height && width == padded_width) { - return; - } - - auto padded_out_data = src.data(); - auto out_data = dst->mutable_data(); - - const index_t img_size = height * width; - const index_t padded_img_size = padded_height * padded_width; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - for (index_t h = 0; h < height; ++h) { - memcpy( - out_data + (b * channels + c) * img_size - + h * width, - padded_out_data - + (b * channels + c) * padded_img_size - + h * padded_width, - sizeof(float) * width); - } // h - } // c - } // b -} - // OCHW => TOC void Conv2dK3x3Winograd::TransformFilter4x4(const float *filter, const index_t in_channels, diff --git a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h index bd5d3e0c..3ed8646b 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h +++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h @@ -15,6 +15,7 @@ #ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_ #define MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_ +#include #include #include "mace/public/mace.h" @@ 
-30,12 +31,10 @@ namespace fp32 { class Conv2dK3x3Winograd : public Conv2dBase { public: - Conv2dK3x3Winograd(int pad_top, int pad_bottom, int pad_left, int pad_right) - : gemm_(), - pad_top_(pad_top), - pad_bottom_(pad_bottom), - pad_left_(pad_left), - pad_right_(pad_right), + Conv2dK3x3Winograd(const std::vector paddings, + const Padding padding_type) + : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type), + gemm_(), transformed_filter_(nullptr), out_tile_size_(0) {} @@ -48,9 +47,6 @@ class Conv2dK3x3Winograd : public Conv2dBase { Tensor *output); private: - void UnPackOutput(const Tensor &padded_output, - Tensor *output); - void TransformFilter4x4(const float *filter, const index_t in_channels, const index_t out_channels, @@ -94,10 +90,6 @@ class Conv2dK3x3Winograd : public Conv2dBase { float *output); Gemm gemm_; - int pad_top_; - int pad_bottom_; - int pad_left_; - int pad_right_; std::unique_ptr transformed_filter_; index_t out_tile_size_; }; diff --git a/mace/ops/arm/conv_2d_neon_5x5.cc b/mace/ops/arm/fp32/conv_2d_5x5.cc similarity index 77% rename from mace/ops/arm/conv_2d_neon_5x5.cc rename to mace/ops/arm/fp32/conv_2d_5x5.cc index 81d89297..264e48fa 100644 --- a/mace/ops/arm/conv_2d_neon_5x5.cc +++ b/mace/ops/arm/fp32/conv_2d_5x5.cc @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(MACE_ENABLE_NEON) #include -#endif - -#include "mace/ops/arm/conv_2d_neon.h" +#include +#include "mace/ops/arm/fp32/conv_2d_5x5.h" namespace mace { namespace ops { +namespace arm { +namespace fp32 { #define MACE_Conv2dNeonK5x5SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ @@ -76,12 +76,40 @@ namespace ops { vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \ vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1); -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK5x5S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -96,26 +124,26 @@ void Conv2dNeonK5x5S1(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * 
out_image_size; -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * out_batch_size + (m + 1) * out_image_size; float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * out_batch_size + (m + 2) * out_image_size; float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 3) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 25 + c * 25; -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 25 + c * 25; const float *filter_ptr1 = - filter + (m + 1) * in_channels * 25 + c * 25; + filter_data + (m + 1) * in_channels * 25 + c * 25; const float *filter_ptr2 = - filter + (m + 2) * in_channels * 25 + c * 25; + filter_data + (m + 2) * in_channels * 25 + c * 25; const float *filter_ptr3 = - filter + (m + 3) * in_channels * 25 + c * 25; + filter_data + (m + 3) * in_channels * 25 + c * 25; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -158,23 +186,16 @@ void Conv2dNeonK5x5S1(const float *input, filter_ptr3 -= 25; } // w } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 25, - in_width, 5, 5, out_height, out_width, - out_ptr0_base + oc * out_image_size, 1); - } -#endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 25 + c * 25; -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 25 + c * 25; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -204,16 +225,17 @@ void Conv2dNeonK5x5S1(const float *input, filter_ptr0 -= 25; } // w } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 5, 5, - out_height, out_width, out_ptr0_base, 1); -#endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } +} // namespace fp32 +} // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_5x5.h b/mace/ops/arm/fp32/conv_2d_5x5.h new file mode 100644 index 00000000..154d74a8 --- /dev/null +++ b/mace/ops/arm/fp32/conv_2d_5x5.h @@ -0,0 +1,48 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_CONV_2D_5X5_H_
+#define MACE_OPS_ARM_FP32_CONV_2D_5X5_H_
+
+#include <vector>
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/conv_2d.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Conv2dK5x5S1 : public Conv2dBase {
+ public:
+  Conv2dK5x5S1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK5x5S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_CONV_2D_5X5_H_
diff --git a/mace/ops/arm/conv_2d_neon_7x7.cc b/mace/ops/arm/fp32/conv_2d_7x7.cc
similarity index 78%
rename from mace/ops/arm/conv_2d_neon_7x7.cc
rename to mace/ops/arm/fp32/conv_2d_7x7.cc
index 2411aad6..86d3e468 100644
--- a/mace/ops/arm/conv_2d_neon_7x7.cc
+++ b/mace/ops/arm/fp32/conv_2d_7x7.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
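The 5x5 and 7x7 kernels in this patch all follow one skeleton: resize the output, pad input and output up to the register tile, run the NEON loops, then crop. A minimal sketch of the shared Compute() body, assuming the unique_ptr element type (stripped by the diff formatting) is Tensor:

  // Sketch only -- mirrors the Compute() bodies in this patch, not a
  // verbatim excerpt. ResizeOutAndPadInOut/UnPadOutput are the
  // Conv2dBase helpers invoked exactly like this in conv_2d_5x5.cc.
  std::unique_ptr<Tensor> padded_input;   // element type assumed: Tensor
  std::unique_ptr<Tensor> padded_output;
  // Resize output; pad so the output plane is a multiple of the
  // 1 (height) x 4 (width) tile the NEON loops produce per iteration.
  ResizeOutAndPadInOut(context, input, filter, output,
                       1, 4, &padded_input, &padded_output);
  const Tensor *in_tensor = padded_input ? padded_input.get() : input;
  Tensor *out_tensor = padded_output ? padded_output.get() : output;
  out_tensor->Clear();                    // kernels accumulate into output
  // ... kernel- and stride-specific NEON loops over in_tensor/out_tensor ...
  UnPadOutput(*out_tensor, output);       // copy the valid region back
  return MaceStatus::MACE_SUCCESS;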
-#if defined(MACE_ENABLE_NEON) #include -#endif - -#include "mace/ops/arm/conv_2d_neon.h" +#include +#include "mace/ops/arm/fp32/conv_2d_7x7.h" namespace mace { namespace ops { +namespace arm { +namespace fp32 { #define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ @@ -153,12 +153,40 @@ namespace ops { vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \ vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK7x7S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -173,26 +201,25 @@ void Conv2dNeonK7x7S1(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * out_batch_size + (m + 1) * out_image_size; float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * out_batch_size + (m + 2) * out_image_size; float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; const float *filter_ptr1 = - filter + (m + 1) * in_channels * 49 + c * 49; + filter_data + (m + 1) * in_channels * 49 + c * 49; const float *filter_ptr2 = - filter + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * in_channels * 49 + c * 49; const float *filter_ptr3 = - filter + (m + 3) * in_channels * 49 + c * 49; + filter_data + (m + 3) * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -243,23 +270,16 @@ void Conv2dNeonK7x7S1(const float *input, filter_ptr3 -= 49; } // w } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49, - in_width, 7, 
7, out_height, out_width, - out_ptr0_base + oc * out_image_size, 1); - } -#endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -297,23 +317,50 @@ void Conv2dNeonK7x7S1(const float *input, filter_ptr0 -= 49; } // w } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7, - out_height, out_width, out_ptr0_base, 1); -#endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK7x7S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -328,26 +375,25 @@ void Conv2dNeonK7x7S2(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * out_batch_size + (m + 1) * out_image_size; float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * out_batch_size + (m + 2) * out_image_size; float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; const float *filter_ptr1 = - filter + (m + 1) * in_channels * 49 + c * 49; + filter_data + (m + 1) * 
in_channels * 49 + c * 49; const float *filter_ptr2 = - filter + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * in_channels * 49 + c * 49; const float *filter_ptr3 = - filter + (m + 3) * in_channels * 49 + c * 49; + filter_data + (m + 3) * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -403,23 +449,16 @@ void Conv2dNeonK7x7S2(const float *input, filter_ptr3 -= 49; } // w } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49, - in_width, 7, 7, out_height, out_width, - out_ptr0_base + oc * out_image_size, 2); - } -#endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -462,23 +501,50 @@ void Conv2dNeonK7x7S2(const float *input, filter_ptr0 -= 49; } // w } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7, - out_height, out_width, out_ptr0_base, 2); -#endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK7x7S3(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -493,26 +559,25 @@ void Conv2dNeonK7x7S3(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * out_batch_size + (m + 1) * out_image_size; float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * out_batch_size + (m + 2) * 
out_image_size; float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; const float *filter_ptr1 = - filter + (m + 1) * in_channels * 49 + c * 49; + filter_data + (m + 1) * in_channels * 49 + c * 49; const float *filter_ptr2 = - filter + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * in_channels * 49 + c * 49; const float *filter_ptr3 = - filter + (m + 3) * in_channels * 49 + c * 49; + filter_data + (m + 3) * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -568,23 +633,16 @@ void Conv2dNeonK7x7S3(const float *input, filter_ptr3 -= 49; } // w } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49, - in_width, 7, 7, out_height, out_width, - out_ptr0_base + oc * out_image_size, 3); - } -#endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -627,16 +685,17 @@ void Conv2dNeonK7x7S3(const float *input, filter_ptr0 -= 49; } // w } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7, - out_height, out_width, out_ptr0_base, 3); -#endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } +} // namespace fp32 +} // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_7x7.h b/mace/ops/arm/fp32/conv_2d_7x7.h new file mode 100644 index 00000000..e64780ba --- /dev/null +++ b/mace/ops/arm/fp32/conv_2d_7x7.h @@ -0,0 +1,73 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
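All three 7x7 variants above keep the same register tiling (Ho = 1, Wo = 4, Co = 4); only the stride changes which input columns each of the four output lanes reads. A scalar equivalent of one 4-wide tile for a single output channel, with s the stride (1, 2 or 3) and dilation 1, matching the in_offset + n * stride_w + kw indexing used in the kernels (in, out, flt and the shape variables are assumed from the surrounding code):

  // Illustrative scalar form of one 1x4 output tile, one output channel.
  for (int n = 0; n < 4; ++n) {
    float acc = out[h * out_width + w + n];
    for (int kh = 0; kh < 7; ++kh) {
      for (int kw = 0; kw < 7; ++kw) {
        // consecutive outputs are s input columns apart
        acc += in[(h * s + kh) * in_width + (w + n) * s + kw]
               * flt[kh * 7 + kw];
      }
    }
    out[h * out_width + w + n] = acc;
  }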
+
+#ifndef MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
+#define MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
+
+#include <vector>
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/conv_2d.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Conv2dK7x7S1 : public Conv2dBase {
+ public:
+  Conv2dK7x7S1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK7x7S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+class Conv2dK7x7S2 : public Conv2dBase {
+ public:
+  Conv2dK7x7S2(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK7x7S2() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+class Conv2dK7x7S3 : public Conv2dBase {
+ public:
+  Conv2dK7x7S3(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({3, 3}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK7x7S3() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
diff --git a/mace/ops/arm/fp32/conv_general.cc b/mace/ops/arm/fp32/conv_general.cc
new file mode 100644
index 00000000..a12c5d53
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_general.cc
@@ -0,0 +1,232 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
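Note that in the header above the stride is baked into the delegator type through the Conv2dBase constructor arguments, so kernel choice happens once at construction. A hypothetical call site, using make_unique the same way mace/ops/conv_2d.cc does later in this patch:

  // Hypothetical usage: pick the stride-2 7x7 kernel once, then reuse it.
  std::unique_ptr<Conv2dBase> conv =
      make_unique<Conv2dK7x7S2>(paddings, padding_type);
  MACE_RETURN_IF_ERROR(conv->Compute(context, input, filter, output));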
+ +#include +#include "mace/ops/arm/fp32/conv_general.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Conv2dGeneral::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + auto filter_shape = filter->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = filter_shape[1] * in_image_size; + const index_t out_batch_size = filter_shape[0] * out_image_size; + const index_t filter_size = filter_shape[2] * filter_shape[3]; + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t b = 0; b < in_shape[0]; b++) { + for (index_t m = 0; m < filter_shape[0]; m += 4) { + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t out_channels = filter_shape[0]; + const index_t in_channels = filter_shape[1]; + + const int stride_h = strides_[0]; + const int stride_w = strides_[1]; + const int dilation_h = dilations_[0]; + const int dilation_w = dilations_[1]; + if (m + 3 < out_channels) { + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = out_ptr0_base + out_image_size; + float *out_ptr2_base = out_ptr1_base + out_image_size; + float *out_ptr3_base = out_ptr2_base + out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = + filter_data + m * in_channels * filter_size + c * filter_size; + const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; + const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; + const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t ih = h * stride_h; + index_t iw = w * stride_w; + index_t in_offset = ih * in_width + iw; + // output (4 outch x 1 height x 4 width): vo_outch_height + float vo0[4], vo1[4], vo2[4], vo3[4]; + // load output + index_t out_offset = h * out_width + w; + for (index_t ow = 0; ow < 4; ++ow) { + vo0[ow] = out_ptr0_base[out_offset + ow]; + vo1[ow] = out_ptr1_base[out_offset + ow]; + vo2[ow] = out_ptr2_base[out_offset + ow]; + vo3[ow] = out_ptr3_base[out_offset + ow]; + } + // calc by row + for (index_t kh = 0; kh < filter_shape[2]; ++kh) { + for (index_t kw = 0; kw < filter_shape[3]; ++kw) { + // outch 0 + vo0[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr0[kw]; + vo0[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[2] += in_ptr_base[in_offset + 2 * 
stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + // outch 1 + vo1[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr1[kw]; + vo1[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr1[kw]; + vo1[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr1[kw]; + vo1[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr1[kw]; + // outch 2 + vo2[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr2[kw]; + vo2[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr2[kw]; + vo2[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr2[kw]; + vo2[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr2[kw]; + // outch 3 + vo3[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr3[kw]; + vo3[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr3[kw]; + vo3[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr3[kw]; + vo3[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr3[kw]; + } // kw + + in_offset += dilation_h * in_width; + filter_ptr0 += filter_shape[3]; + filter_ptr1 += filter_shape[3]; + filter_ptr2 += filter_shape[3]; + filter_ptr3 += filter_shape[3]; + } // kh + + for (index_t ow = 0; ow < 4; ++ow) { + out_ptr0_base[out_offset + ow] = vo0[ow]; + out_ptr1_base[out_offset + ow] = vo1[ow]; + out_ptr2_base[out_offset + ow] = vo2[ow]; + out_ptr3_base[out_offset + ow] = vo3[ow]; + } + + filter_ptr0 -= filter_size; + filter_ptr1 -= filter_size; + filter_ptr2 -= filter_size; + filter_ptr3 -= filter_size; + } // w + } // h + } // c + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = + filter_data + mm * in_channels * filter_size + c * filter_size; + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t ih = h * stride_h; + index_t iw = w * stride_w; + index_t in_offset = ih * in_width + iw; + // output (1 outch x 1 height x 4 width): vo_outch_height + float vo0[4]; + // load output + index_t out_offset = h * out_width + w; + for (index_t ow = 0; ow < 4; ++ow) { + vo0[ow] = out_ptr0_base[out_offset + ow]; + } + + // calc by row + for (index_t kh = 0; kh < filter_shape[2]; ++kh) { + for (index_t kw = 0; kw < filter_shape[3]; ++kw) { + // outch 0 + vo0[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr0[kw]; + vo0[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + } // kw + + in_offset += dilation_h * in_width; + filter_ptr0 += filter_shape[3]; + } // kh + + for (index_t ow = 0; ow < 4; ++ow) { + out_ptr0_base[out_offset + ow] = vo0[ow]; + } + filter_ptr0 -= filter_size; + } // w + } // h + } // c + } // mm + } // if + } // m + } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/conv_general.h b/mace/ops/arm/fp32/conv_general.h 
new file mode 100644
index 00000000..01d01954
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_general.h
@@ -0,0 +1,50 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_CONV_GENERAL_H_
+#define MACE_OPS_ARM_FP32_CONV_GENERAL_H_
+
+#include <vector>
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/conv_2d.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Conv2dGeneral : public Conv2dBase {
+ public:
+  Conv2dGeneral(const std::vector<int> strides,
+                const std::vector<int> dilations,
+                const std::vector<int> paddings,
+                const Padding padding_type)
+      : Conv2dBase(strides, dilations, paddings, padding_type) {}
+  virtual ~Conv2dGeneral() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_CONV_GENERAL_H_
diff --git a/mace/ops/arm/fp32/gemm.h b/mace/ops/arm/fp32/gemm.h
index 36eb0378..ce226c1a 100644
--- a/mace/ops/arm/fp32/gemm.h
+++ b/mace/ops/arm/fp32/gemm.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/fp32/gemv.h b/mace/ops/arm/fp32/gemv.h
index 3210def1..1f406426 100644
--- a/mace/ops/arm/fp32/gemv.h
+++ b/mace/ops/arm/fp32/gemv.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/q8/eltwise.h b/mace/ops/arm/q8/eltwise.h
index 5223dc30..200b13cb 100644
--- a/mace/ops/arm/q8/eltwise.h
+++ b/mace/ops/arm/q8/eltwise.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/q8/gemv.h b/mace/ops/arm/q8/gemv.h
index 1734a956..21a27579 100644
--- a/mace/ops/arm/q8/gemv.h
+++ b/mace/ops/arm/q8/gemv.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
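For review purposes, the computation conv_general.cc above performs on the padded NCHW buffers (no bias, no groups) can be stated directly:

  // out[b][m][h][w] =
  //     sum over c, kh, kw of
  //       in[b][c][h * stride_h + kh * dilation_h]
  //               [w * stride_w + kw * dilation_w] * filter[m][c][kh][kw]
  // Padding has already been applied, so the loops carry no bounds
  // checks; m is blocked by 4 output channels and w by 4 output columns,
  // with a scalar remainder loop (mm) for the channel tail.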
diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 65a723ad..a6421f45 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -27,7 +27,6 @@ #include "mace/core/operator.h" #include "mace/core/tensor.h" #include "mace/ops/activation.h" -#include "mace/ops/arm/conv_2d_neon.h" #include "mace/ops/conv_pool_2d_base.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/utils/memory.h" @@ -36,11 +35,16 @@ #ifdef MACE_ENABLE_NEON #include "mace/ops/arm/fp32/conv_2d.h" #include "mace/ops/arm/fp32/conv_2d_1x1.h" +#include "mace/ops/arm/fp32/conv_2d_3x3.h" #include "mace/ops/arm/fp32/conv_2d_3x3_winograd.h" -#else -#include "mace/ops/ref/conv_2d.h" +#include "mace/ops/arm/fp32/conv_2d_5x5.h" +#include "mace/ops/arm/fp32/conv_2d_7x7.h" +#include "mace/ops/arm/fp32/conv_2d_1xn.h" +#include "mace/ops/arm/fp32/conv_general.h" #endif // MACE_ENABLE_NEON +#include "mace/ops/ref/conv_2d.h" + #ifdef MACE_ENABLE_QUANTIZE #include "mace/ops/common/gemmlowp_util.h" #include "mace/ops/quantization_util.h" @@ -68,8 +72,7 @@ class Conv2dOp : public ConvPool2dOpBase { "NOOP"))), relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)), leakyrelu_coefficient_(Operation::GetOptionalArg( - "leakyrelu_coefficient", 0.0f)), - conv2d_delegator_(nullptr) {} + "leakyrelu_coefficient", 0.0f)) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); @@ -77,342 +80,99 @@ class Conv2dOp : public ConvPool2dOpBase { const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr; Tensor *output = this->Output(OUTPUT); - index_t input_batch = input->dim(0); - index_t input_channels = input->dim(1); - std::vector filter_shape(4); - filter_shape = filter->shape(); - - index_t stride_h = strides_[0]; - index_t stride_w = strides_[1]; - - index_t dilation_h = dilations_[0]; - index_t dilation_w = dilations_[1]; - - std::vector output_shape(4); - std::vector paddings(2); - if (paddings_.empty()) { - CalcNCHWPaddingAndOutputSize(input->shape().data(), - filter_shape.data(), - dilations_.data(), - strides_.data(), - padding_type_, - output_shape.data(), - paddings.data()); - } else { - paddings = paddings_; - CalcNCHWOutputSize(input->shape().data(), - filter_shape.data(), - paddings_.data(), - dilations_.data(), - strides_.data(), - RoundType::FLOOR, - output_shape.data()); - } - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - - index_t batch = output->dim(0); - index_t channels = output->dim(1); - index_t height = output->dim(2); - index_t width = output->dim(3); - - MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); - MACE_CHECK(filter_shape[0] == channels, filter_shape[0], " != ", channels); - MACE_CHECK(filter_shape[1] == input_channels, filter_shape[1], " != ", - input_channels); + const index_t channels = filter->dim(0); #ifdef MACE_ENABLE_NEON - index_t input_height = input->dim(2); - index_t input_width = input->dim(3); - index_t filter_h = filter->dim(2); - index_t filter_w = filter->dim(3); - - int pad_top = paddings[0] >> 1; - int pad_bottom = paddings[0] - pad_top; - int pad_left = paddings[1] >> 1; - int pad_right = paddings[1] - pad_left; - - if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 - && dilation_h == 1 && dilation_w == 1) { - if (conv2d_delegator_.get() == nullptr) { - conv2d_delegator_ = make_unique(); - } - conv2d_delegator_->Compute(context, input, filter, output); - } else if (filter_h == 3 && filter_w == 3 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1 - && 
input_channels >= 8 && channels >= 8) { - if (conv2d_delegator_.get() == nullptr) { - conv2d_delegator_ = make_unique( - pad_top, pad_bottom, pad_left, pad_right); - } - conv2d_delegator_->Compute(context, input, filter, output); - } else { - // TODO(liyin): the code below needs to be refactored. - // delegate to each of kernels instead of ruling them all - index_t padded_input_height = input_height + paddings[0]; - index_t padded_input_width = input_width + paddings[1]; - index_t extra_input_height = padded_input_height; - index_t extra_input_width = padded_input_width; - index_t extra_output_height = height; - index_t extra_output_width = width; - - int pad_top = paddings[0] >> 1; - int pad_bottom = paddings[0] - pad_top; - int pad_left = paddings[1] >> 1; - int pad_right = paddings[1] - pad_left; - - Tensor::MappingGuard input_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard output_guard(output); - - auto filter_data = filter->data(); - auto output_data = output->mutable_data(); - - std::function conv_func; - - bool use_neon_3x3_s1 = filter_h == 3 && filter_w == 3 + // the following params are used to decide which conv delegator to use + const index_t stride_h = strides_[0]; + const index_t stride_w = strides_[1]; + const index_t dilation_h = dilations_[0]; + const index_t dilation_w = dilations_[1]; + const index_t filter_h = filter->dim(2); + const index_t filter_w = filter->dim(3); + const index_t input_channels = input->dim(1); + + // NOTE: delegator is fixed after first round of running, + // although winograd depends on input params. + // We do not support changeable filter for now. + if (conv2d_delegator_.get() == nullptr) { + if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 + && dilation_h == 1 && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_3x3_s2 = filter_h == 3 && filter_w == 3 + && dilation_w == 1) { + if (input_channels >= 8 && channels >= 8) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } + } else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_5x5_s1 = filter_h == 5 && filter_w == 5 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 5 && filter_w == 5 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_1x7_s1 = filter_h == 1 && filter_w == 7 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 7 && filter_w == 7 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_7x1_s1 = filter_h == 7 && filter_w == 1 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_7x7_s1 = filter_h == 7 && filter_w == 7 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_7x7_s2 = filter_h == 7 && filter_w == 7 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 7 && filter_w == 7 && stride_h == 2 && stride_w == 2 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_7x7_s3 = filter_h == 7 && filter_w == 7 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + 
paddings_, padding_type_); + } else if (filter_h == 7 && filter_w == 7 && stride_h == 3 && stride_w == 3 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_1x15_s1 = filter_h == 1 && filter_w == 15 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 1 && filter_w == 7 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_15x1_s1 = filter_h == 15 && filter_w == 1 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 7 && filter_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - - index_t tile_h, tile_w; - if (use_neon_3x3_s1) { - tile_h = 2; - tile_w = 4; - } else if (use_neon_7x1_s1 || use_neon_15x1_s1) { - tile_h = 4; - tile_w = 1; - } else { - tile_h = 1; - tile_w = 4; - } - extra_output_height = RoundUp(height, tile_h); - extra_input_height = - std::max(padded_input_height, (extra_output_height - 1) * stride_h - + (filter_h - 1) * dilation_h + 1); - extra_output_width = RoundUp(width, tile_w); - extra_input_width = - std::max(padded_input_width, (extra_output_width - 1) * stride_w - + (filter_w - 1) * dilation_w + 1); - if (extra_input_height != padded_input_height) { - pad_bottom += (extra_input_height - padded_input_height); - } - if (extra_input_width != padded_input_width) { - pad_right += (extra_input_width - padded_input_width); - } - - // decide scratch size before allocate it - index_t total_scratch_size = 0; - index_t padded_input_size = 0; - index_t padded_output_size = 0; - - if (extra_input_height != input_height - || extra_input_width != input_width) { - padded_input_size = - PadAlignSize( - batch * input_channels * (input_height + pad_top + pad_bottom) - * (input_width + pad_left + pad_right) * sizeof(float) + - MACE_EXTRA_BUFFER_PAD_SIZE); - total_scratch_size += padded_input_size; - } - if (extra_output_height != height || extra_output_width != width) { - padded_output_size = - PadAlignSize( - batch * channels * extra_output_height * extra_output_width - * sizeof(float) + MACE_EXTRA_BUFFER_PAD_SIZE); - total_scratch_size += padded_output_size; - } - - // Init scratch buffer - ScratchBuffer *scratch = context->device()->scratch_buffer(); - scratch->Rewind(); - scratch->GrowSize(total_scratch_size); - Tensor padded_input(scratch->Scratch(padded_input_size), DT_FLOAT); - Tensor padded_output(scratch->Scratch(padded_output_size), DT_FLOAT); - const index_t extra_input_shape[4] = - {batch, input_channels, extra_input_height, extra_input_width}; - const index_t extra_output_shape[4] = - {batch, channels, extra_output_height, extra_output_width}; - - // make host compiler happy - MACE_UNUSED(extra_input_shape); - MACE_UNUSED(extra_output_shape); - - // decide which convolution function to call - if (use_neon_3x3_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK3x3S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_3x3_s2) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK3x3S2(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_5x5_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK5x5S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_1x7_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - 
Conv2dNeonK1x7S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_7x1_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK7x1S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_7x7_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK7x7S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_7x7_s2) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK7x7S2(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_7x7_s3) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK7x7S3(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_1x15_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK1x15S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_15x1_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK15x1S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dGeneral(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - filter_shape.data(), - strides_.data(), - dilations_.data(), - pad_output); - }; - } - - // pad input and output - const Tensor *pad_input_ptr = input; - if (extra_input_height != input_height - || extra_input_width != input_width) { - MACE_RETURN_IF_ERROR(ConstructNCHWInputWithSpecificPadding( - input, pad_top, pad_bottom, pad_left, pad_right, &padded_input)); - pad_input_ptr = &padded_input; - } - - // TODO(libin): don't need clear after bias is integrated in each conv - Tensor *pad_output_ptr = output; - if (extra_output_height != height || extra_output_width != width) { - padded_output.Reshape({batch, channels, extra_output_height, - extra_output_width}); - padded_output.Clear(); - pad_output_ptr = &padded_output; + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 1 && filter_w == 15 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 15 && filter_w == 1 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); } else { - output->Clear(); - } - - const float *pad_input_data = pad_input_ptr->data(); - float *pad_output_data = pad_output_ptr->mutable_data(); - - conv_func(pad_input_data, pad_output_data); - - // unpack output - if (extra_output_height != height || extra_output_width != width) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - for (index_t h = 0; h < height; ++h) { - memcpy( - output_data + b * channels * height * width - + c * height * width - + h * width, - pad_output_data - + b * channels * extra_output_height * extra_output_width - + c * extra_output_height * extra_output_width - + h * extra_output_width, - sizeof(float) * width); - } - } - } + conv2d_delegator_ = make_unique( + strides_, + dilations_, + paddings_, + padding_type_); } } + + 
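// Dispatch recap, derived from the branches above (delegator class
// names inferred from the fp32 headers added in this patch; the choice
// is made once, on the first Run()):
//   1x1 s1                 -> Conv2dK1x1
//   3x3 s1, in/out C >= 8  -> Conv2dK3x3Winograd, else Conv2dK3x3S1
//   3x3 s2                 -> Conv2dK3x3S2
//   5x5 s1                 -> Conv2dK5x5S1
//   7x7 s1 / s2 / s3       -> Conv2dK7x7S1 / S2 / S3
//   1x7 / 7x1 s1           -> Conv2dK1x7S1 / Conv2dK7x1S1
//   1x15 / 15x1 s1         -> Conv2dK1x15S1 / Conv2dK15x1S1
//   anything else          -> Conv2dGeneral (any stride/dilation)
// All specialized paths require dilation 1.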
conv2d_delegator_->Compute(context, input, filter, output); #else - if (conv2d_delegator_.get() == nullptr) { - conv2d_delegator_ = make_unique>(paddings[0], - paddings[1], - stride_h, - stride_w, - dilation_h, - dilation_w); + if (ref_conv2d_delegator_.get() == nullptr) { + ref_conv2d_delegator_ = make_unique>(strides_, + dilations_, + paddings_, + padding_type_); } - conv2d_delegator_->Compute(context, input, filter, output); + ref_conv2d_delegator_->Compute(context, input, filter, output); #endif Tensor::MappingGuard bias_guard(bias); @@ -420,6 +180,9 @@ class Conv2dOp : public ConvPool2dOpBase { auto bias_data = bias == nullptr ? nullptr : bias->data(); auto output_data = output->mutable_data(); if (bias_data != nullptr) { + const index_t batch = input->dim(0); + const index_t height = output->dim(2); + const index_t width = output->dim(3); const index_t image_size = height * width; #pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { @@ -452,186 +215,13 @@ class Conv2dOp : public ConvPool2dOpBase { } private: - void Conv2dGeneral(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - const index_t *filter_shape, - const int *stride_hw, - const int *dilation_hw, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = filter_shape[1] * in_image_size; - const index_t out_batch_size = filter_shape[0] * out_image_size; - const index_t filter_size = filter_shape[2] * filter_shape[3]; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < in_shape[0]; b++) { - for (index_t m = 0; m < filter_shape[0]; m += 4) { - const index_t in_width = in_shape[3]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t out_channels = filter_shape[0]; - const index_t in_channels = filter_shape[1]; - - const int stride_h = stride_hw[0]; - const int stride_w = stride_hw[1]; - const int dilation_h = dilation_hw[0]; - const int dilation_w = dilation_hw[1]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = out_ptr0_base + out_image_size; - float *out_ptr2_base = out_ptr1_base + out_image_size; - float *out_ptr3_base = out_ptr2_base + out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = - filter + m * in_channels * filter_size + c * filter_size; - const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; - const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; - const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t ih = h * stride_h; - index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; - // output (4 outch x 1 height x 4 width): vo_outch_height - float vo0[4], vo1[4], vo2[4], vo3[4]; - // load output - index_t out_offset = h * out_width + w; - for (index_t ow = 0; ow < 4; ++ow) { - vo0[ow] = out_ptr0_base[out_offset + ow]; - vo1[ow] = out_ptr1_base[out_offset + ow]; - vo2[ow] = out_ptr2_base[out_offset + ow]; - vo3[ow] = out_ptr3_base[out_offset + ow]; - } - // calc by row - for (index_t kh = 0; kh < filter_shape[2]; ++kh) { - for (index_t kw = 0; kw < filter_shape[3]; 
++kw) { - // outch 0 - vo0[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr0[kw]; - vo0[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - // outch 1 - vo1[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr1[kw]; - vo1[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr1[kw]; - vo1[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr1[kw]; - vo1[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr1[kw]; - // outch 2 - vo2[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr2[kw]; - vo2[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr2[kw]; - vo2[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr2[kw]; - vo2[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr2[kw]; - // outch 3 - vo3[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr3[kw]; - vo3[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr3[kw]; - vo3[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr3[kw]; - vo3[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr3[kw]; - } // kw - - in_offset += dilation_h * in_width; - filter_ptr0 += filter_shape[3]; - filter_ptr1 += filter_shape[3]; - filter_ptr2 += filter_shape[3]; - filter_ptr3 += filter_shape[3]; - } // kh - - for (index_t ow = 0; ow < 4; ++ow) { - out_ptr0_base[out_offset + ow] = vo0[ow]; - out_ptr1_base[out_offset + ow] = vo1[ow]; - out_ptr2_base[out_offset + ow] = vo2[ow]; - out_ptr3_base[out_offset + ow] = vo3[ow]; - } - - filter_ptr0 -= filter_size; - filter_ptr1 -= filter_size; - filter_ptr2 -= filter_size; - filter_ptr3 -= filter_size; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { - float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = - filter + mm * in_channels * filter_size + c * filter_size; - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t ih = h * stride_h; - index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; - // output (1 outch x 1 height x 4 width): vo_outch_height - float vo0[4]; - // load output - index_t out_offset = h * out_width + w; - for (index_t ow = 0; ow < 4; ++ow) { - vo0[ow] = out_ptr0_base[out_offset + ow]; - } - - // calc by row - for (index_t kh = 0; kh < filter_shape[2]; ++kh) { - for (index_t kw = 0; kw < filter_shape[3]; ++kw) { - // outch 0 - vo0[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr0[kw]; - vo0[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - } // kw - - in_offset += dilation_h * in_width; - filter_ptr0 += filter_shape[3]; - } // kh - - for (index_t ow = 0; ow < 4; ++ow) { - out_ptr0_base[out_offset + ow] = vo0[ow]; - } - filter_ptr0 -= filter_size; - } // w - } // h - } // c - } // mm - } // if - } // m - } // b - } const ActivationType 
activation_;
   const float relux_max_limit_;
   const float leakyrelu_coefficient_;
 
 #ifdef MACE_ENABLE_NEON
   std::unique_ptr<arm::fp32::Conv2dBase> conv2d_delegator_;
 #else
-  std::unique_ptr<ref::Conv2d<float>> conv2d_delegator_;
+  std::unique_ptr<ref::Conv2d<float>> ref_conv2d_delegator_;
 #endif  // MACE_ENABLE_NEON
 
  private:
diff --git a/mace/ops/ref/conv_2d.cc b/mace/ops/ref/conv_2d.cc
index 4707d922..e5b7952a 100644
--- a/mace/ops/ref/conv_2d.cc
+++ b/mace/ops/ref/conv_2d.cc
@@ -16,7 +16,6 @@
 #include "mace/ops/ref/conv_2d.h"
 
 #include <vector>
-#include "mace/ops/common/conv_pool_2d_util.h"
 
 namespace mace {
 namespace ops {
@@ -30,31 +29,36 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
 
   const std::vector<index_t> in_shape = input->shape();
   const std::vector<index_t> filter_shape = filter->shape();
-  const std::vector<index_t> out_shape = output->shape();
-  const std::vector<int> stride_hw{stride_h_, stride_w_};
-  const std::vector<int> dilation_hw{dilation_h_, dilation_w_};
-  const std::vector<int> paddings{pad_h_, pad_w_};
-  const index_t pad_top = pad_h_ >> 1;
-  const index_t pad_left = pad_w_ >> 1;
-
-  std::vector<index_t> output_shape(4);
-
-  CalcOutputSize(in_shape.data(),
-                 NCHW,
-                 filter_shape.data(),
-                 OIHW,
-                 paddings.data(),
-                 dilation_hw.data(),
-                 stride_hw.data(),
-                 RoundType::FLOOR,
-                 output_shape.data());
-  output->Resize(output_shape);
-
+  std::vector<index_t> out_shape(4);
+
+  std::vector<int> paddings(2);
+  if (paddings_.empty()) {
+    CalcNCHWPaddingAndOutputSize(input->shape().data(),
+                                 filter->shape().data(),
+                                 dilations_.data(),
+                                 strides_.data(),
+                                 padding_type_,
+                                 out_shape.data(),
+                                 paddings.data());
+  } else {
+    paddings = paddings_;
+    CalcNCHWOutputSize(input->shape().data(),
+                       filter->shape().data(),
+                       paddings_.data(),
+                       dilations_.data(),
+                       strides_.data(),
+                       RoundType::FLOOR,
+                       out_shape.data());
+  }
+  const index_t pad_top = paddings[0] >> 1;
+  const index_t pad_left = paddings[1] >> 1;
+  output->Resize(out_shape);
 
   const index_t in_image_size = in_shape[2] * in_shape[3];
   const index_t out_image_size = out_shape[2] * out_shape[3];
   const index_t in_batch_size = filter_shape[1] * in_image_size;
   const index_t out_batch_size = filter_shape[0] * out_image_size;
   const index_t filter_size = filter_shape[2] * filter_shape[3];
+
   Tensor::MappingGuard input_guard(input);
   Tensor::MappingGuard filter_guard(filter);
   Tensor::MappingGuard output_guard(output);
@@ -86,8 +90,10 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
 
           for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
             for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
-              const index_t ih = -pad_top + h * stride_h_ + kh * dilation_h_;
-              const index_t iw = -pad_left + w * stride_w_ + kw * dilation_w_;
+              const index_t
+                  ih = -pad_top + h * strides_[0] + kh * dilations_[0];
+              const index_t
+                  iw = -pad_left + w * strides_[1] + kw * dilations_[1];
               if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
                 sum += in_ptr_base[ih * in_width + iw] * filter_ptr[kw];
               }
diff --git a/mace/ops/ref/conv_2d.h b/mace/ops/ref/conv_2d.h
index e99af5cf..10baac8c 100644
--- a/mace/ops/ref/conv_2d.h
+++ b/mace/ops/ref/conv_2d.h
@@ -16,9 +16,12 @@
 #ifndef MACE_OPS_REF_CONV_2D_H_
 #define MACE_OPS_REF_CONV_2D_H_
 
+#include <vector>
+
 #include "mace/public/mace.h"
 #include "mace/core/tensor.h"
 #include "mace/core/op_context.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
 
 namespace mace {
 namespace ops {
@@ -27,30 +30,39 @@ namespace ref {
 
 template<typename OUTPUT_TYPE>
 class Conv2d {
  public:
-  Conv2d(int stride_h, int stride_w, int dilation_h, int dilation_w);
+  Conv2d(const std::vector<int> strides,
+         const std::vector<int> dilations,
+         const std::vector<int> paddings,
+         const Padding padding_type)
+      : strides_(strides),
+        dilations_(dilations),
+        paddings_(paddings),
+        padding_type_(padding_type) {}
   ~Conv2d() {}
 
   MaceStatus Compute(
       const OpContext *context,
       const Tensor *input,
       const Tensor *filter,
       Tensor *output);
+
+ private:
+  const std::vector<int> strides_;
+  const std::vector<int> dilations_;
+  const std::vector<int> paddings_;
+  const Padding padding_type_;
 };
 
 template<>
 class Conv2d<float> {
  public:
-  Conv2d(int pad_h,
-         int pad_w,
-         int stride_h,
-         int stride_w,
-         int dilation_h,
-         int dilation_w)
-      : pad_h_(pad_h),
-        pad_w_(pad_w),
-        stride_h_(stride_h),
-        stride_w_(stride_w),
-        dilation_h_(dilation_h),
-        dilation_w_(dilation_w) {}
+  Conv2d(const std::vector<int> strides,
+         const std::vector<int> dilations,
+         const std::vector<int> paddings,
+         const Padding padding_type)
+      : strides_(strides),
+        dilations_(dilations),
+        paddings_(paddings),
+        padding_type_(padding_type) {}
   ~Conv2d() {}
 
   // Always row-major after transpose
   MaceStatus Compute(
@@ -60,12 +72,10 @@ class Conv2d<float> {
       Tensor *output);
 
  private:
-  int pad_h_;
-  int pad_w_;
-  int stride_h_;
-  int stride_w_;
-  int dilation_h_;
-  int dilation_w_;
+  const std::vector<int> strides_;
+  const std::vector<int> dilations_;
+  const std::vector<int> paddings_;
+  const Padding padding_type_;
 };
 
 }  // namespace ref
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 256a397c..4cee2aab 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -1954,7 +1954,8 @@ class Transformer(base_converter.ConverterInterface):
         else:
             print("Quantize op %s (%s)" % (op.name, op.type))
 
-        non_zero = self._option.device == DeviceType.CPU.value
+        non_zero = self._option.device == DeviceType.CPU.value \
+            and op.type == MaceOp.MatMul.name
 
         for idx, input_tensor in enumerate(op.input):
             quantized_inputs_names.append(input_tensor)
-- 
GitLab
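A closing note on the padding arithmetic this patch moves into the delegators: both CalcNCHWOutputSize call sites reduce to floor((in + pad_total - dilated_kernel_extent) / stride) + 1 per spatial dimension. A hypothetical standalone helper (name and signature ours, not MACE's) that mirrors the rule:

  #include <cstdint>

  // Hypothetical mirror of the output-size rule used above; the real
  // implementation lives in mace/ops/common/conv_pool_2d_util.h.
  int64_t ConvOutSize(int64_t in, int64_t kernel,
                      int64_t stride, int64_t dilation, int64_t pad_total) {
    const int64_t k_eff = (kernel - 1) * dilation + 1;  // dilated extent
    return (in + pad_total - k_eff) / stride + 1;       // FLOOR rounding
  }

  // e.g. in = 224, kernel = 7, stride = 2, dilation = 1, pad_total = 6
  //      gives (224 + 6 - 7) / 2 + 1 = 112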