Commit d66c971c authored by: 李滨

Merge branch 'fix_conv' into 'master'

Support padding for 1x1

See merge request !1023
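Background for the change: a 1x1 convolution is exactly a GEMM between the [out_channels x in_channels] filter matrix and the [in_channels x H*W] input plane, so "padding support" reduces to zero-padding each input plane before the multiply, which is what the new Conv2dK1x1::Compute below does with PadInput followed by gemm_.Compute. A minimal self-contained sketch of that reduction (plain C++; Conv1x1WithPad and all names are illustrative, not MACE API):

#include <cstddef>
#include <vector>

// Zero-pad each CHW input plane, then run the 1x1 convolution as a GEMM:
// out[oc][p] = sum_ic filter[oc][ic] * padded_in[ic][p], p over padded H*W.
std::vector<float> Conv1x1WithPad(const std::vector<float> &in,    // C*H*W
                                  const std::vector<float> &filt,  // OC*C
                                  std::size_t C, std::size_t H, std::size_t W,
                                  std::size_t OC,
                                  std::size_t pad_top, std::size_t pad_bottom,
                                  std::size_t pad_left, std::size_t pad_right) {
  const std::size_t PH = H + pad_top + pad_bottom;
  const std::size_t PW = W + pad_left + pad_right;
  // Pad: copy each input row into the interior of a zeroed plane.
  std::vector<float> padded(C * PH * PW, 0.0f);
  for (std::size_t c = 0; c < C; ++c)
    for (std::size_t h = 0; h < H; ++h)
      for (std::size_t w = 0; w < W; ++w)
        padded[(c * PH + h + pad_top) * PW + (w + pad_left)] =
            in[(c * H + h) * W + w];
  // GEMM: [OC x C] * [C x PH*PW] -> [OC x PH*PW]; with a 1x1 kernel the
  // output spatial size equals the padded input spatial size.
  std::vector<float> out(OC * PH * PW, 0.0f);
  for (std::size_t oc = 0; oc < OC; ++oc)
    for (std::size_t ic = 0; ic < C; ++ic)
      for (std::size_t p = 0; p < PH * PW; ++p)
        out[oc * PH * PW + p] += filt[oc * C + ic] * padded[ic * PH * PW + p];
  return out;
}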
......@@ -503,7 +503,7 @@ class ScratchBuffer: public Buffer {
virtual ~ScratchBuffer() {}
MaceStatus GrowSize(const index_t size) {
- if (size > size_) {
+ if (offset_ + size > size_) {
VLOG(1) << "Grow scratch size to: " << size;
MACE_CHECK(offset_ == 0, "scratch is being used, cannot grow size");
return Resize(size);
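// Rationale: Scratch() advances offset_, so a grow request has to be checked
// against the remaining capacity, not the total size_. Previously, with
// offset_ > 0, size <= size_ and offset_ + size > size_, the grow was
// skipped and the next Scratch(size) overran the buffer; now the
// MACE_CHECK above fails loudly instead of corrupting memory.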
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_CONV_2D_NEON_H_
#define MACE_OPS_ARM_CONV_2D_NEON_H_
#include "mace/core/types.h"
namespace mace {
namespace ops {
void Conv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK5x5S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK1x7S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK7x1S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK7x7S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK7x7S2(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK7x7S3(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK1x15S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK15x1S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
// accumulate one input channel's contribution into one output channel
inline void Conv2dCPUKHxKWCalc(const float *in_ptr,
const float *filter_ptr,
const index_t in_width,
const index_t filter_height,
const index_t filter_width,
const index_t out_height,
const index_t out_width,
float *out_ptr,
const int stride) {
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
for (int i = 0; i < filter_height; ++i) {
for (int j = 0; j < filter_width; ++j) {
out_ptr[h * out_width + w] +=
in_ptr[(h * stride + i) * in_width + (w * stride + j)] *
filter_ptr[i * filter_width + j];
}
}
}
}
}
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_CONV_2D_NEON_H_
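A hedged usage sketch for the scalar fallback above (hypothetical sizes; ExampleKHxKW is illustrative and assumes this header is included). With valid padding and stride s, the kernel computes out(h, w) += sum over (i, j) of in(h*s + i, w*s + j) * f(i, j), so a 5x5 input with a 3x3 filter yields a 3x3 output:

void ExampleKHxKW() {
  float in[5 * 5] = {};    // fill with real data; zeroed here for brevity
  float filt[3 * 3] = {};
  float out[3 * 3] = {};   // must start zeroed: the kernel accumulates (+=)
  mace::ops::Conv2dCPUKHxKWCalc(in, filt, /*in_width=*/5,
                                /*filter_height=*/3, /*filter_width=*/3,
                                /*out_height=*/3, /*out_width=*/3,
                                out, /*stride=*/1);
}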
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/ops/arm/conv_2d_neon.h"
#include "mace/utils/math.h"
namespace mace {
namespace ops {
inline void Conv2dCPUK15x1Calc(const float *in_ptr,
const float *filter_ptr,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t w,
const index_t tile_width,
const index_t out_image_size,
float *out_ptr,
const index_t io,
const int stride) {
for (index_t ih = 0; ih < out_height; ++ih) {
for (index_t iw = 0; iw < tile_width && w + iw < out_width; ++iw) {
for (int i = 0; i < 15; ++i) {
for (int j = 0; j < 1; ++j) {
out_ptr[io * out_image_size + ih * out_width + w + iw] +=
in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)] *
filter_ptr[io * in_channels * 15 + i * 1 + j];
}
}
}
}
}
// Ho = 4, Wo = 1, Co = 1
void Conv2dNeonK15x1S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
const index_t tile_width =
out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3];
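// With fewer than 4 output channels there is little channel-level
// parallelism, so the width is split into roughly four tiles to keep the
// collapse(3) loop below busy; e.g. a single output channel with
// out_width 33 gives tile_width = RoundUpDiv4(33) = 9.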
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
for (index_t w = 0; w < out_shape[3]; w += tile_width) {
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr = filter + m * in_channels * 15 + c * 15;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
/* load the 15x1 filter taps as four overlapping float32x4 vectors */
float32x4_t vf0, vf1, vf2, vf3;
vf0 = vld1q_f32(filter_ptr);
vf1 = vld1q_f32(filter_ptr + 4);
vf2 = vld1q_f32(filter_ptr + 8);
vf3 = vld1q_f32(filter_ptr + 11);
for (index_t h = 0; h + 3 < out_height; h += 4) {
for (index_t wt = 0; wt < tile_width && w + wt < out_width; ++wt) {
// load output
index_t out_offset = h * out_width + w + wt;
// output (1 outch x 4 height x 1 width): vo_outch_height
float32x4_t vo = {out_ptr_base[out_offset],
out_ptr_base[out_offset + out_width],
out_ptr_base[out_offset + 2 * out_width],
out_ptr_base[out_offset + 3 * out_width]};
// input offset
index_t in_offset = h * in_width + w + wt;
// input (gather rows into five vectors; vext builds the shifted windows)
float32x4_t vi0 = {in_ptr_base[in_offset],
in_ptr_base[in_offset + in_width],
in_ptr_base[in_offset + 2 * in_width],
in_ptr_base[in_offset + 3 * in_width]};
float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width],
in_ptr_base[in_offset + 5 * in_width],
in_ptr_base[in_offset + 6 * in_width],
in_ptr_base[in_offset + 7 * in_width]};
float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width],
in_ptr_base[in_offset + 9 * in_width],
in_ptr_base[in_offset + 10 * in_width],
in_ptr_base[in_offset + 11 * in_width]};
float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width],
in_ptr_base[in_offset + 13 * in_width],
in_ptr_base[in_offset + 14 * in_width],
in_ptr_base[in_offset + 15 * in_width]};
float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width],
in_ptr_base[in_offset + 17 * in_width]};
float32x4_t vi1 = vextq_f32(vi0, vi4, 1);
float32x4_t vi2 = vextq_f32(vi0, vi4, 2);
float32x4_t vi3 = vextq_f32(vi0, vi4, 3);
float32x4_t vi5 = vextq_f32(vi4, vi8, 1);
float32x4_t vi6 = vextq_f32(vi4, vi8, 2);
float32x4_t vi7 = vextq_f32(vi4, vi8, 3);
float32x4_t vi9 = vextq_f32(vi8, vi12, 1);
float32x4_t vi10 = vextq_f32(vi8, vi12, 2);
float32x4_t vi11 = vextq_f32(vi8, vi12, 3);
float32x4_t vi13 = vextq_f32(vi12, vi16, 1);
float32x4_t vi14 = vextq_f32(vi12, vi16, 2);
vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0);
vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1);
vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0);
vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1);
vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0);
vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1);
vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0);
vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1);
vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0);
vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1);
vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0);
vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1);
vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1);
vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0);
vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1);
out_ptr_base[out_offset] = vo[0];
out_ptr_base[out_offset + out_width] = vo[1];
out_ptr_base[out_offset + 2 * out_width] = vo[2];
out_ptr_base[out_offset + 3 * out_width] = vo[3];
} // wt
} // h
#else
Conv2dCPUK15x1Calc(in_ptr_base, filter_ptr, in_width, in_channels,
out_height, out_width, w, tile_width,
out_image_size, out_ptr_base, 0, 1);
#endif
} // c
} // w
} // m
} // b
}
} // namespace ops
} // namespace mace
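The 15-tap kernels above build their fourteen shifted input windows from a handful of aligned vector loads using vextq_f32 rather than doing one unaligned load per tap. A minimal illustration of the primitive (assumes a NEON toolchain; shifted_window is an illustrative name):

#include <arm_neon.h>
// With a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3} taken back to back from
// one row, vextq_f32(a, b, 1) is the same window advanced by one element:
// {a1, a2, a3, b0}.
static inline float32x4_t shifted_window(float32x4_t a, float32x4_t b) {
  return vextq_f32(a, b, 1);
}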
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/ops/arm/conv_2d_neon.h"
#include "mace/utils/logging.h"
#include "mace/utils/math.h"
namespace mace {
namespace ops {
inline void Conv2dCPUK1x15Calc(const float *in_ptr,
const float *filter_ptr,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t h,
const index_t tile_height,
const index_t out_width,
const index_t out_image_size,
float *out_ptr,
const index_t io,
const int stride) {
for (index_t ih = 0; ih < tile_height && h + ih < out_height; ++ih) {
for (index_t iw = 0; iw < out_width; ++iw) {
for (int i = 0; i < 1; ++i) {
for (int j = 0; j < 15; ++j) {
out_ptr[io * out_image_size + (h + ih) * out_width + iw] +=
in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)] *
filter_ptr[io * in_channels * 15 + i * 15 + j];
}
}
}
}
}
// Ho = 1, Wo = 4, Co = 1
void Conv2dNeonK1x15S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
const index_t tile_height =
out_shape[1] < 4 ? RoundUpDiv4(out_shape[2]) : out_shape[2];
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
for (index_t h = 0; h < out_shape[2]; h += tile_height) {
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr = filter + m * in_channels * 15 + c * 15;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
/* load the 1x15 filter taps as four overlapping float32x4 vectors */
float32x4_t vf0, vf1, vf2, vf3;
vf0 = vld1q_f32(filter_ptr);
vf1 = vld1q_f32(filter_ptr + 4);
vf2 = vld1q_f32(filter_ptr + 8);
vf3 = vld1q_f32(filter_ptr + 11);
for (index_t ht = 0; ht < tile_height && h + ht < out_height; ++ht) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo;
// load output
index_t out_offset = (h + ht) * out_width + w;
vo = vld1q_f32(out_ptr_base + out_offset);
// input (five contiguous loads; vext builds the shifted windows)
float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9,
vi10, vi11, vi12, vi13, vi14, vi16;
// input offset
index_t in_offset = (h + ht) * in_width + w;
// load input
vi0 = vld1q_f32(in_ptr_base + in_offset);
vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
vi8 = vld1q_f32(in_ptr_base + in_offset + 8);
vi12 = vld1q_f32(in_ptr_base + in_offset + 12);
vi16 = vld1q_f32(in_ptr_base + in_offset + 16);
vi1 = vextq_f32(vi0, vi4, 1);
vi2 = vextq_f32(vi0, vi4, 2);
vi3 = vextq_f32(vi0, vi4, 3);
vi5 = vextq_f32(vi4, vi8, 1);
vi6 = vextq_f32(vi4, vi8, 2);
vi7 = vextq_f32(vi4, vi8, 3);
vi9 = vextq_f32(vi8, vi12, 1);
vi10 = vextq_f32(vi8, vi12, 2);
vi11 = vextq_f32(vi8, vi12, 3);
vi13 = vextq_f32(vi12, vi16, 1);
vi14 = vextq_f32(vi12, vi16, 2);
vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0);
vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1);
vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0);
vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1);
vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0);
vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1);
vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0);
vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1);
vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0);
vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1);
vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0);
vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1);
vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1);
vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0);
vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1);
vst1q_f32(out_ptr_base + out_offset, vo);
} // w
} // ht
#else
Conv2dCPUK1x15Calc(in_ptr_base, filter_ptr, in_width, in_channels,
out_height, h, tile_height, out_width,
out_image_size, out_ptr_base, 0, 1);
#endif
} // c
} // h
} // m
} // b
}
} // namespace ops
} // namespace mace
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/ops/arm/conv_2d_neon.h"
namespace mace {
namespace ops {
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK1x7S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
output + b * out_batch_size + (m + 3) * out_image_size;
#endif
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7;
#if defined(MACE_ENABLE_NEON)
const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7;
const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7;
const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7;
/* load filter (4 outch x 1 height x 4 width) */
float32x4_t vf00, vf01;
float32x4_t vf10, vf11;
float32x4_t vf20, vf21;
float32x4_t vf30, vf31;
vf00 = vld1q_f32(filter_ptr0);
vf01 = vld1q_f32(filter_ptr0 + 3);
vf10 = vld1q_f32(filter_ptr1);
vf11 = vld1q_f32(filter_ptr1 + 3);
vf20 = vld1q_f32(filter_ptr2);
vf21 = vld1q_f32(filter_ptr2 + 3);
vf30 = vld1q_f32(filter_ptr3);
vf31 = vld1q_f32(filter_ptr3 + 3);
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// output (4 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0, vo1, vo2, vo3;
// load output
index_t out_offset = h * out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset);
vo1 = vld1q_f32(out_ptr1_base + out_offset);
vo2 = vld1q_f32(out_ptr2_base + out_offset);
vo3 = vld1q_f32(out_ptr3_base + out_offset);
// input (3 slide)
float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8;
// input offset
index_t in_offset = h * in_width + w;
// load input
vi0 = vld1q_f32(in_ptr_base + in_offset);
vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
vi8 = vld1q_f32(in_ptr_base + in_offset + 8);
vi1 = vextq_f32(vi0, vi4, 1);
vi2 = vextq_f32(vi0, vi4, 2);
vi3 = vextq_f32(vi0, vi4, 3);
vi5 = vextq_f32(vi4, vi8, 1);
vi6 = vextq_f32(vi4, vi8, 2);
#if defined(__aarch64__)
/* outch 0 */
vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0);
vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1);
vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2);
vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3);
vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1);
vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2);
vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
/* outch 1 */
vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0);
vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1);
vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2);
vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3);
vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1);
vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2);
vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3);
/* outch 2 */
vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0);
vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1);
vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2);
vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3);
vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1);
vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2);
vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3);
/* outch 3 */
vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0);
vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1);
vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2);
vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3);
vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1);
vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2);
vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3);
#else
/* outch 0 */
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0);
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1);
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0);
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1);
vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1);
vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0);
vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
/* outch 1 */
vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0);
vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1);
vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0);
vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1);
vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1);
vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0);
vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1);
/* outch 2 */
vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0);
vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1);
vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0);
vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1);
vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1);
vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0);
vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1);
/* outch 3 */
vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0);
vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1);
vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0);
vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1);
vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1);
vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0);
vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1);
#endif
vst1q_f32(out_ptr0_base + out_offset, vo0);
vst1q_f32(out_ptr1_base + out_offset, vo1);
vst1q_f32(out_ptr2_base + out_offset, vo2);
vst1q_f32(out_ptr3_base + out_offset, vo3);
} // w
} // h
#else
for (index_t oc = 0; oc < 4; ++oc) {
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 7,
in_width, 1, 7, out_height, out_width,
out_ptr0_base + oc * out_image_size, 1);
}
#endif
} // c
} else {
for (index_t mm = m; mm < out_channels; ++mm) {
float *out_ptr0_base =
output + b * out_batch_size + mm * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + mm * in_channels * 7 + c * 7;
#if defined(MACE_ENABLE_NEON)
/* load filter (1 outch x 1 height x 4 width) */
float32x4_t vf00, vf01;
vf00 = vld1q_f32(filter_ptr0);
vf01 = vld1q_f32(filter_ptr0 + 3);
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0;
// load output
index_t out_offset = h * out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset);
// input (3 slide)
float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8;
// input offset
index_t in_offset = h * in_width + w;
// load input
vi0 = vld1q_f32(in_ptr_base + in_offset);
vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
vi8 = vld1q_f32(in_ptr_base + in_offset + 8);
vi1 = vextq_f32(vi0, vi4, 1);
vi2 = vextq_f32(vi0, vi4, 2);
vi3 = vextq_f32(vi0, vi4, 3);
vi5 = vextq_f32(vi4, vi8, 1);
vi6 = vextq_f32(vi4, vi8, 2);
#if defined(__aarch64__)
vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0);
vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1);
vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2);
vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3);
vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1);
vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2);
vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
#else
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0);
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1);
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0);
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1);
vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1);
vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0);
vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
#endif
vst1q_f32(out_ptr0_base + out_offset, vo0);
} // w
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 1, 7,
out_height, out_width, out_ptr0_base, 1);
#endif
} // c
}
} // if
} // m
} // b
}
} // namespace ops
} // namespace mace
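A note on the two intrinsic families used above: they perform the same multiply-accumulate, but aarch64's vfmaq_laneq_f32 can address any lane of a 128-bit register, while armv7's vmlaq_lane_f32 only takes a 64-bit register, hence the vget_low_f32/vget_high_f32 splitting. For example, for lane 2 of a float32x4_t vf:

// aarch64: vo = vfmaq_laneq_f32(vo, vi, vf, 2);
// armv7:   vo = vmlaq_lane_f32(vo, vi, vget_high_f32(vf), 0);  // vf lane 2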
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <utility>
#include <algorithm>
#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/utils/memory.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
void Conv2dBase::CalOutputShapeAndPadSize(const Tensor *input,
const Tensor *filter,
const int out_tile_height,
const int out_tile_width,
std::vector<index_t> *output_shape,
std::vector<int> *in_pad_size,
std::vector<int> *out_pad_size) {
in_pad_size->resize(4);
out_pad_size->resize(4);
output_shape->resize(4);
const index_t in_height = input->dim(2);
const index_t in_width = input->dim(3);
const index_t stride_h = strides_[0];
const index_t stride_w = strides_[1];
const index_t dilation_h = dilations_[0];
const index_t dilation_w = dilations_[1];
const index_t filter_h = filter->dim(2);
const index_t filter_w = filter->dim(3);
std::vector<int> paddings(2);
if (paddings_.empty()) {
CalcNCHWPaddingAndOutputSize(input->shape().data(),
filter->shape().data(),
dilations_.data(),
strides_.data(),
padding_type_,
output_shape->data(),
paddings.data());
} else {
paddings = paddings_;
CalcNCHWOutputSize(input->shape().data(),
filter->shape().data(),
paddings_.data(),
dilations_.data(),
strides_.data(),
RoundType::FLOOR,
output_shape->data());
}
const index_t out_height = (*output_shape)[2];
const index_t out_width = (*output_shape)[3];
const index_t
padded_out_height = RoundUp<index_t>(out_height, out_tile_height);
const index_t padded_out_width = RoundUp<index_t>(out_width, out_tile_width);
const index_t padded_in_height =
std::max(in_height + paddings[0], (padded_out_height - 1) * stride_h
+ (filter_h - 1) * dilation_h + 1);
const index_t padded_in_width =
std::max(in_width + paddings[1], (padded_out_width - 1) * stride_w
+ (filter_w - 1) * dilation_w + 1);
(*in_pad_size)[0] = paddings[0] >> 1;
(*in_pad_size)[1] =
static_cast<int>(padded_in_height - in_height - (*in_pad_size)[0]);
(*in_pad_size)[2] = paddings[1] >> 1;
(*in_pad_size)[3] =
static_cast<int>(padded_in_width - in_width - (*in_pad_size)[2]);
(*out_pad_size)[0] = 0;
(*out_pad_size)[1] = static_cast<int>(padded_out_height - out_height);
(*out_pad_size)[2] = 0;
(*out_pad_size)[3] = static_cast<int>(padded_out_width - out_width);
}
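// Worked example of the sizing above (all numbers hypothetical): a 3x3
// filter, stride 1, dilation 1, 7x7 input with paddings = {2, 2}, and
// out tiles of 2 (height) x 4 (width). The output is 7x7, so
// padded_out = RoundUp(7, 2) x RoundUp(7, 4) = 8x8, and
// padded_in_height = max(7 + 2, (8 - 1) * 1 + (3 - 1) * 1 + 1) = 10
// (likewise padded_in_width = 10). That yields in_pad_size = {1, 2, 1, 2}
// (top, bottom, left, right) and out_pad_size = {0, 1, 0, 1}; the extra
// bottom/right output rows are trimmed later by UnPadOutput.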
MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output,
const int out_tile_height,
const int out_tile_width,
std::unique_ptr<const Tensor>
*padded_input,
std::unique_ptr<Tensor>
*padded_output) {
std::vector<index_t> output_shape;
std::vector<int> in_pad_size;
std::vector<int> out_pad_size;
CalOutputShapeAndPadSize(input,
filter,
out_tile_height,
out_tile_width,
&output_shape,
&in_pad_size,
&out_pad_size);
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
const index_t batch = input->dim(0);
const index_t in_channels = input->dim(1);
const index_t in_height = input->dim(2);
const index_t in_width = input->dim(3);
const index_t out_channels = output->dim(1);
const index_t out_height = output->dim(2);
const index_t out_width = output->dim(3);
const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1];
const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3];
const index_t
padded_out_height = out_height + out_pad_size[0] + out_pad_size[1];
const index_t
padded_out_width = out_width + out_pad_size[2] + out_pad_size[3];
const bool is_in_padded =
padded_in_height != in_height || padded_in_width != in_width;
const bool is_out_padded =
padded_out_height != out_height || padded_out_width != out_width;
auto scratch_buffer = context->device()->scratch_buffer();
const index_t padded_in_size =
MACE_EXTRA_BUFFER_PAD_SIZE + (is_in_padded ? PadAlignSize(
sizeof(float) * batch * in_channels * padded_in_height
* padded_in_width) : 0);
const index_t padded_out_size = is_out_padded ? PadAlignSize(
sizeof(float) * batch * out_channels * padded_out_height
* padded_out_width) : 0;
scratch_buffer->Rewind();
scratch_buffer->GrowSize(padded_in_size + padded_out_size);
if (is_in_padded) {
std::unique_ptr<Tensor>
padded_in =
make_unique<Tensor>(scratch_buffer->Scratch(padded_in_size),
DataType::DT_FLOAT);
padded_in->Resize({batch, in_channels, padded_in_height, padded_in_width});
PadInput(*input, in_pad_size[0], in_pad_size[2], padded_in.get());
*padded_input = std::move(padded_in);
}
if (is_out_padded) {
std::unique_ptr<Tensor>
padded_out = make_unique<Tensor>(scratch_buffer->Scratch(padded_out_size),
DataType::DT_FLOAT);
padded_out->Resize({batch, out_channels, padded_out_height,
padded_out_width});
*padded_output = std::move(padded_out);
}
return MaceStatus::MACE_SUCCESS;
}
void Conv2dBase::PadInput(const Tensor &src,
const int pad_top,
const int pad_left,
mace::Tensor *dst) {
if (dst == &src) return;
const index_t batch = src.dim(0);
const index_t channels = src.dim(1);
const index_t height = src.dim(2);
const index_t width = src.dim(3);
const index_t padded_height = dst->dim(2);
const index_t padded_width = dst->dim(3);
const int pad_bottom = static_cast<int>(padded_height - height - pad_top);
const int pad_right = static_cast<int>(padded_width - width - pad_left);
auto in_data = src.data<float>();
auto padded_in_data = dst->mutable_data<float>();
const index_t img_size = height * width;
const index_t padded_img_size = padded_height * padded_width;
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t bc = b * channels + c;
const float *in_base = in_data + bc * img_size;
float *padded_in_base = padded_in_data + bc * padded_img_size;
memset(padded_in_base, 0, sizeof(float) * pad_top * padded_width);
padded_in_base += pad_top * padded_width;
for (index_t h = 0; h < height; ++h) {
memset(padded_in_base,
0,
sizeof(float) * pad_left);
memcpy(padded_in_base + pad_left,
in_base,
sizeof(float) * width);
memset(padded_in_base + pad_left + width,
0,
sizeof(float) * pad_right);
in_base += width;
padded_in_base += padded_width;
}
memset(padded_in_base, 0, sizeof(float) * pad_bottom * padded_width);
}
}
}
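// In effect, each padded plane is written as a zeroed top band, then per-row
// [zero left | data | zero right], then a zeroed bottom band. For example,
// with all four pads equal to 1, a 2x2 plane {a b; c d} becomes:
//   0 0 0 0
//   0 a b 0
//   0 c d 0
//   0 0 0 0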
void Conv2dBase::UnPadOutput(const mace::Tensor &src, mace::Tensor *dst) {
if (dst == &src) return;
const index_t batch = dst->dim(0);
const index_t channels = dst->dim(1);
const index_t height = dst->dim(2);
const index_t width = dst->dim(3);
const index_t padded_height = src.dim(2);
const index_t padded_width = src.dim(3);
auto padded_out_data = src.data<float>();
auto out_data = dst->mutable_data<float>();
const index_t img_size = height * width;
const index_t padded_img_size = padded_height * padded_width;
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t bc = (b * channels + c);
float *out_base = out_data + bc * img_size;
const float *padded_out_base = padded_out_data + bc * padded_img_size;
for (index_t h = 0; h < height; ++h) {
memcpy(out_base,
padded_out_base,
sizeof(float) * width);
out_base += width;
padded_out_base += padded_width;
} // h
} // c
} // b
}
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -15,10 +15,14 @@
#ifndef MACE_OPS_ARM_FP32_CONV_2D_H_
#define MACE_OPS_ARM_FP32_CONV_2D_H_
#include <vector>
#include <memory>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/gemm.h"
#include "mace/ops/common/conv_pool_2d_util.h"
namespace mace {
namespace ops {
......@@ -27,13 +31,51 @@ namespace fp32 {
class Conv2dBase {
public:
Conv2dBase() = default;
Conv2dBase(const std::vector<int> strides,
const std::vector<int> dilations,
const std::vector<int> paddings,
const Padding padding_type)
: strides_(strides),
dilations_(dilations),
paddings_(paddings),
padding_type_(padding_type) {}
virtual ~Conv2dBase() = default;
virtual MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) = 0;
protected:
void CalOutputShapeAndPadSize(const Tensor *input,
const Tensor *filter,
const int out_tile_height,
const int out_tile_width,
std::vector<index_t> *output_shape,
std::vector<int> *in_pad_size,
std::vector<int> *out_pad_size);
MaceStatus ResizeOutAndPadInOut(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output,
const int out_tile_height,
const int out_tile_width,
std::unique_ptr<const Tensor> *padded_input,
std::unique_ptr<Tensor> *padded_output);
void PadInput(const Tensor &src,
const int pad_top,
const int pad_left,
Tensor *dst);
void UnPadOutput(const Tensor &src, Tensor *dst);
const std::vector<int> strides_;
const std::vector<int> dilations_;
const std::vector<int> paddings_;
const Padding padding_type_;
};
} // namespace fp32
......
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/fp32/conv_2d_1x1.h"
namespace mace {
......@@ -25,20 +24,68 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,
const Tensor *filter,
Tensor *output) {
index_t batch = input->dim(0);
- index_t height = input->dim(2);
- index_t width = input->dim(3);
+ index_t in_height = input->dim(2);
+ index_t in_width = input->dim(3);
index_t in_channels = input->dim(1);
- index_t out_channels = filter->dim(0);
- MACE_RETURN_IF_ERROR(output->Resize({batch, out_channels, height, width}));
- context->device()->scratch_buffer()->Rewind();
std::vector<index_t> output_shape;
std::vector<int> in_pad_size;
std::vector<int> out_pad_size;
CalOutputShapeAndPadSize(input,
filter,
1,
1,
&output_shape,
&in_pad_size,
&out_pad_size);
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
const index_t out_channels = output_shape[1];
const index_t out_height = output_shape[2];
const index_t out_width = output_shape[3];
const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1];
const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3];
// pad input and transform input
const bool is_in_padded =
in_height != padded_in_height || in_width != padded_in_width;
auto scratch_buffer = context->device()->scratch_buffer();
const index_t padded_in_size = is_in_padded ? PadAlignSize(
sizeof(float) * batch * in_channels * padded_in_height
* padded_in_width) : 0;
const index_t pack_filter_size =
PadAlignSize(sizeof(float) * out_channels * in_channels);
const index_t pack_input_size =
PadAlignSize(
sizeof(float) * in_channels * padded_in_height * padded_in_width);
const index_t pack_output_size =
PadAlignSize(
sizeof(float) * out_channels * padded_in_height * padded_in_width);
const index_t gemm_pack_size =
pack_filter_size + pack_input_size + pack_output_size;
scratch_buffer->Rewind();
scratch_buffer->GrowSize(padded_in_size + gemm_pack_size);
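// Reserve the worst case up front: each Scratch() call below advances the
// buffer offset, and (per the GrowSize fix above) growth is refused while
// the buffer is in use, so all sub-allocations must be sized here.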
const Tensor *padded_in = input;
Tensor tmp_padded_in
(scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT);
if (is_in_padded) {
tmp_padded_in.Resize({batch, in_channels, padded_in_height,
padded_in_width});
PadInput(*input, in_pad_size[0], in_pad_size[2], &tmp_padded_in);
padded_in = &tmp_padded_in;
}
return gemm_.Compute(context,
filter,
- input,
+ padded_in,
batch,
out_channels,
in_channels,
in_channels,
- height * width,
+ out_height * out_width,
false,
false,
false,
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -15,6 +15,7 @@
#ifndef MACE_OPS_ARM_FP32_CONV_2D_1X1_H_
#define MACE_OPS_ARM_FP32_CONV_2D_1X1_H_
#include <vector>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
......@@ -28,7 +29,8 @@ namespace fp32 {
class Conv2dK1x1 : public Conv2dBase {
public:
Conv2dK1x1() : gemm_(true) {}
Conv2dK1x1(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK1x1() {}
MaceStatus Compute(
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
#define MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
#include <vector>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/conv_2d.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
class Conv2dK1x7S1 : public Conv2dBase {
public:
Conv2dK1x7S1(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK1x7S1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
class Conv2dK7x1S1 : public Conv2dBase {
public:
Conv2dK7x1S1(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK7x1S1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
class Conv2dK1x15S1 : public Conv2dBase {
public:
Conv2dK1x15S1(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK1x15S1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
class Conv2dK15x1S1 : public Conv2dBase {
public:
Conv2dK15x1S1(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK15x1S1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -12,22 +12,49 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/utils/macros.h"
#include "mace/ops/arm/conv_2d_neon.h"
#include <memory>
#include "mace/ops/arm/fp32/conv_2d_3x3.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
2,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto in_shape = in_tensor->shape();
auto out_shape = out_tensor->shape();
// Ho = 2, Wo = 4, Co = 2
- void Conv2dNeonK3x3S1(const float *input,
- const float *filter,
- const index_t *in_shape,
- const index_t *out_shape,
- float *output) {
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
......@@ -42,26 +69,26 @@ void Conv2dNeonK3x3S1(const float *input,
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 1 < out_channels) {
- float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
- #if defined(MACE_ENABLE_NEON)
+ float *out_ptr0_base =
+ output_data + b * out_batch_size + m * out_image_size;
float *out_ptr1_base =
- output + b * out_batch_size + (m + 1) * out_image_size;
- #endif
+ output_data + b * out_batch_size + (m + 1) * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
- const float *in_ptr0 = input + b * in_batch_size + c * in_image_size;
- const float *filter_ptr0 = filter + m * in_channels * 9 + c * 9;
+ const float
+ *in_ptr0 = input_data + b * in_batch_size + c * in_image_size;
+ const float *filter_ptr0 = filter_data + m * in_channels * 9 + c * 9;
- #if defined(MACE_ENABLE_NEON)
float *out_ptr1 = out_ptr1_base;
const float *in_ptr1 =
- input + b * in_batch_size + c * in_image_size + 1 * in_width;
+ input_data + b * in_batch_size + c * in_image_size + 1 * in_width;
const float *in_ptr2 =
- input + b * in_batch_size + c * in_image_size + 2 * in_width;
+ input_data + b * in_batch_size + c * in_image_size + 2 * in_width;
const float *in_ptr3 =
- input + b * in_batch_size + c * in_image_size + 3 * in_width;
- const float *filter_ptr1 = filter + (m + 1) * in_channels * 9 + c * 9;
- #endif
- #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+ input_data + b * in_batch_size + c * in_image_size + 3 * in_width;
+ const float
+ *filter_ptr1 = filter_data + (m + 1) * in_channels * 9 + c * 9;
+ #if defined(__aarch64__)
float *out_ptr0 = out_ptr0_base;
// load filter (2 outch x 3 height x 3 width): vf_outch_height
......@@ -179,7 +206,7 @@ void Conv2dNeonK3x3S1(const float *input,
out_ptr0 += out_width;
out_ptr1 += out_width;
} // h
- #elif defined(MACE_ENABLE_NEON) // arm v7
+ #else // arm v7
float *out_ptr0 = out_ptr0_base;
// load filter (2 outch x 3 height x 3 width): vf_outch_height
......@@ -301,32 +328,28 @@ void Conv2dNeonK3x3S1(const float *input,
out_ptr0 += out_width;
out_ptr1 += out_width;
} // h
- #else
- for (index_t oc = 0; oc < 2; ++oc) {
- Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0 + oc * in_channels * 9,
- in_width, 3, 3, out_height, out_width,
- out_ptr0_base + oc * out_image_size, 1);
- }
#endif
} // c
} else {
for (index_t mm = m; mm < out_channels; ++mm) {
float *out_ptr0_base =
- output + b * out_batch_size + mm * out_image_size;
+ output_data + b * out_batch_size + mm * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr0 =
- input + b * in_batch_size + c * in_image_size;
- #if defined(MACE_ENABLE_NEON)
+ input_data + b * in_batch_size + c * in_image_size;
const float *in_ptr1 =
- input + b * in_batch_size + c * in_image_size + 1 * in_width;
+ input_data + b * in_batch_size + c * in_image_size
+ + 1 * in_width;
const float *in_ptr2 =
- input + b * in_batch_size + c * in_image_size + 2 * in_width;
+ input_data + b * in_batch_size + c * in_image_size
+ + 2 * in_width;
const float *in_ptr3 =
- input + b * in_batch_size + c * in_image_size + 3 * in_width;
- #endif
- const float *filter_ptr0 = filter + mm * in_channels * 9 + c * 9;
+ input_data + b * in_batch_size + c * in_image_size
+ + 3 * in_width;
+ const float
+ *filter_ptr0 = filter_data + mm * in_channels * 9 + c * 9;
- #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+ #if defined(__aarch64__)
float *out_ptr0 = out_ptr0_base;
// load filter (1 outch x 3 height x 3 width): vf_outch_height
......@@ -409,7 +432,7 @@ void Conv2dNeonK3x3S1(const float *input,
out_ptr0 += out_width;
} // h
- #elif defined(MACE_ENABLE_NEON) // arm v7
+ #else // arm v7
float *out_ptr0 = out_ptr0_base;
// load filter (1 outch x 3 height x 3 width): vf_outch_height
......@@ -494,22 +517,52 @@ void Conv2dNeonK3x3S1(const float *input,
out_ptr0 += out_width;
} // h
- #else
- Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0, in_width, 3, 3, out_height,
- out_width, out_ptr0_base, 1);
#endif
} // c
} // mm
} // if
} // m
} // b
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
- void Conv2dNeonK3x3S2(const float *input,
- const float *filter,
- const index_t *in_shape,
- const index_t *out_shape,
- float *output) {
+ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context,
+ const Tensor *input,
+ const Tensor *filter,
+ Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto in_shape = in_tensor->shape();
auto out_shape = out_tensor->shape();
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
......@@ -523,11 +576,12 @@ void Conv2dNeonK3x3S2(const float *input,
const index_t in_width = in_shape[3];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
- const float *in_base = input + b * in_batch_size + c * in_image_size;
- const float *filter_ptr = filter + m * in_channels * 9 + c * 9;
- float *out_base = output + b * out_batch_size + m * out_image_size;
+ const float
+ *in_base = input_data + b * in_batch_size + c * in_image_size;
+ const float *filter_ptr = filter_data + m * in_channels * 9 + c * 9;
+ float *out_base = output_data + b * out_batch_size + m * out_image_size;
- #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+ #if defined(__aarch64__)
// load filter (1 outch x 3 height x 3 width): vf_outch_height
float32x4_t vf00, vf01, vf02;
vf00 = vld1q_f32(filter_ptr);
......@@ -587,7 +641,7 @@ void Conv2dNeonK3x3S2(const float *input,
vst1q_f32(out_base + out_offset, vo);
} // w
} // h
- #elif defined(MACE_ENABLE_NEON) // arm v7
+ #else // arm v7
// load filter (1 outch x 3 height x 3 width): vf_outch_height
float32x2_t vf01, vf23, vf45, vf67, vf78;
vf01 = vld1_f32(filter_ptr);
......@@ -649,14 +703,16 @@ void Conv2dNeonK3x3S2(const float *input,
vst1q_f32(out_base + out_offset, vo);
} // w
} // h
- #else
- Conv2dCPUKHxKWCalc(in_base, filter_ptr, in_width, 3, 3, out_height,
- out_width, out_base, 2);
#endif
} // c
} // m
} // b
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
#define MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
#include <vector>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/conv_2d.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
class Conv2dK3x3S1 : public Conv2dBase {
public:
Conv2dK3x3S1(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK3x3S1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
class Conv2dK3x3S2 : public Conv2dBase {
public:
Conv2dK3x3S2(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK3x3S2() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
......@@ -34,13 +34,6 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
const index_t in_width = input->dim(3);
const index_t out_channels = filter->dim(0);
- index_t padded_in_height = in_height + pad_top_ + pad_bottom_;
- index_t padded_in_width = in_width + pad_left_ + pad_right_;
- index_t out_height = padded_in_height - 2;
- index_t out_width = padded_in_width - 2;
- output->Resize({batch, out_channels, out_height, out_width});
// When the input feature map is larger than 16x16, use a Winograd
// output tile size of 6 for higher performance.
index_t out_tile_size = 2;
......@@ -48,10 +41,35 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
out_tile_size = 6;
}
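// F(6x6, 3x3) Winograd transforms 8x8 input tiles and reduces the multiply
// count about 2.25x further than F(2x2, 3x3); the transform overhead only
// pays off once the feature map comfortably exceeds 16x16.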
- const index_t padded_out_height = RoundUp<index_t>(out_height, out_tile_size);
- const index_t padded_out_width = RoundUp<index_t>(out_width, out_tile_size);
- padded_in_height = std::max(padded_in_height, padded_out_height + 2);
- padded_in_width = std::max(padded_in_width, padded_out_width + 2);
std::vector<index_t> output_shape;
std::vector<int> in_pad_size;
std::vector<int> out_pad_size;
CalOutputShapeAndPadSize(input,
filter,
out_tile_size,
out_tile_size,
&output_shape,
&in_pad_size,
&out_pad_size);
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
const index_t out_height = output_shape[2];
const index_t out_width = output_shape[3];
const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1];
const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3];
const index_t
padded_out_height = out_height + out_pad_size[0] + out_pad_size[1];
const index_t
padded_out_width = out_width + out_pad_size[2] + out_pad_size[3];
const int pad_top = in_pad_size[0];
const int pad_left = in_pad_size[2];
bool is_in_padded =
padded_in_height != in_height || padded_in_width != in_width;
bool is_out_padded =
padded_out_height != out_height || padded_out_width != out_width;
......@@ -63,8 +81,9 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
// pad input and transform input
auto scratch_buffer = context->device()->scratch_buffer();
- const index_t padded_in_size = PadAlignSize(
- sizeof(float) * batch * in_channels * padded_in_height * padded_in_width);
+ const index_t padded_in_size = is_in_padded ? PadAlignSize(
+ sizeof(float) * batch * in_channels * padded_in_height
+ * padded_in_width) : 0;
const index_t padded_out_size = is_out_padded ? PadAlignSize(
sizeof(float) * batch * out_channels * padded_out_height
* padded_out_width) : 0;
......@@ -81,8 +100,18 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
scratch_buffer->GrowSize(
padded_in_size + padded_out_size + transformed_in_size
+ transformed_out_size + gemm_pack_size);
- Tensor padded_in(scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT);
- padded_in.Resize({batch, in_channels, padded_in_height, padded_in_width});
const Tensor *padded_in = input;
Tensor tmp_padded_in
(scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT);
if (is_in_padded) {
tmp_padded_in.Resize({batch, in_channels, padded_in_height,
padded_in_width});
Tensor::MappingGuard guard(&tmp_padded_in);
PadInput(*input, pad_top, pad_left, &tmp_padded_in);
padded_in = &tmp_padded_in;
}
Tensor *padded_out = output;
Tensor tmp_padded_out
(scratch_buffer->Scratch(padded_out_size), DataType::DT_FLOAT);
......@@ -94,22 +123,10 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
auto transformed_in = scratch_buffer->Scratch(transformed_in_size);
auto transformed_out = scratch_buffer->Scratch(transformed_out_size);
- auto padded_in_data = padded_in.data<float>();
+ auto padded_in_data = padded_in->data<float>();
auto padded_out_data = padded_out->mutable_data<float>();
auto transformed_in_data = transformed_in.mutable_data<float>();
auto transformed_out_data = transformed_out.mutable_data<float>();
- const index_t padded_bottom = padded_in_height - in_height - pad_top_;
- const index_t padded_right = padded_in_width - in_width - pad_left_;
- ConstructNCHWInputWithSpecificPadding(input,
- pad_top_,
- padded_bottom,
- pad_left_,
- padded_right,
- &padded_in);
- Tensor::MappingGuard filter_guard(filter);
auto filter_data = filter->data<float>();
if (!filter->is_weight() || out_tile_size != out_tile_size_) {
......@@ -215,47 +232,11 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
default:MACE_NOT_IMPLEMENTED;
}
- if (is_out_padded) {
- UnPackOutput(*padded_out, output);
- }
+ UnPadOutput(*padded_out, output);
return MaceStatus::MACE_SUCCESS;
}
- void Conv2dK3x3Winograd::UnPackOutput(const Tensor &src, Tensor *dst) {
- const index_t batch = dst->dim(0);
- const index_t channels = dst->dim(1);
- const index_t height = dst->dim(2);
- const index_t width = dst->dim(3);
- const index_t padded_height = src.dim(2);
- const index_t padded_width = src.dim(3);
- if (height == padded_height && width == padded_width) {
- return;
- }
- auto padded_out_data = src.data<float>();
- auto out_data = dst->mutable_data<float>();
- const index_t img_size = height * width;
- const index_t padded_img_size = padded_height * padded_width;
- #pragma omp parallel for collapse(3) schedule(runtime)
- for (index_t b = 0; b < batch; ++b) {
- for (index_t c = 0; c < channels; ++c) {
- for (index_t h = 0; h < height; ++h) {
- memcpy(
- out_data + (b * channels + c) * img_size
- + h * width,
- padded_out_data
- + (b * channels + c) * padded_img_size
- + h * padded_width,
- sizeof(float) * width);
- } // h
- } // c
- } // b
- }
// OCHW => TOC
void Conv2dK3x3Winograd::TransformFilter4x4(const float *filter,
const index_t in_channels,
......
......@@ -15,6 +15,7 @@
#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_
#define MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_
#include <vector>
#include <memory>
#include "mace/public/mace.h"
......@@ -30,12 +31,10 @@ namespace fp32 {
class Conv2dK3x3Winograd : public Conv2dBase {
public:
Conv2dK3x3Winograd(int pad_top, int pad_bottom, int pad_left, int pad_right)
: gemm_(),
pad_top_(pad_top),
pad_bottom_(pad_bottom),
pad_left_(pad_left),
pad_right_(pad_right),
Conv2dK3x3Winograd(const std::vector<int> paddings,
const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type),
gemm_(),
transformed_filter_(nullptr),
out_tile_size_(0) {}
......@@ -48,9 +47,6 @@ class Conv2dK3x3Winograd : public Conv2dBase {
Tensor *output);
private:
void UnPackOutput(const Tensor &padded_output,
Tensor *output);
void TransformFilter4x4(const float *filter,
const index_t in_channels,
const index_t out_channels,
......@@ -94,10 +90,6 @@ class Conv2dK3x3Winograd : public Conv2dBase {
float *output);
Gemm gemm_;
int pad_top_;
int pad_bottom_;
int pad_left_;
int pad_right_;
std::unique_ptr<Tensor> transformed_filter_;
index_t out_tile_size_;
};
......
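Padding is no longer pre-resolved by the caller: the constructor forwards the raw paddings and padding_type to Conv2dBase together with the fixed {1, 1} strides and dilations. A usage sketch under those assumptions; the attribute values are hypothetical and would normally come from the model definition:

std::vector<int> paddings;          // empty: let padding_type decide
Padding padding_type = SAME;        // e.g. SAME or VALID
Conv2dK3x3Winograd winograd(paddings, padding_type);
// later: winograd.Compute(context, input, filter, output);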
// Copyright 2018 The MACE Authors. All Rights Reserved.
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -12,14 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/ops/arm/conv_2d_neon.h"
#include <memory>
#include "mace/ops/arm/fp32/conv_2d_5x5.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
#define MACE_Conv2dNeonK5x5SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \
......@@ -76,12 +76,40 @@ namespace ops {
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1);
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK5x5S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto in_shape = in_tensor->shape();
auto out_shape = out_tensor->shape();
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
......@@ -96,26 +124,26 @@ void Conv2dNeonK5x5S1(const float *input,
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size;
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
output_data + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
output_data + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
output + b * out_batch_size + (m + 3) * out_image_size;
#endif
output_data + b * out_batch_size + (m + 3) * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 25 + c * 25;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
input_data + b * in_batch_size + c * in_image_size;
const float
*filter_ptr0 = filter_data + m * in_channels * 25 + c * 25;
const float *filter_ptr1 =
filter + (m + 1) * in_channels * 25 + c * 25;
filter_data + (m + 1) * in_channels * 25 + c * 25;
const float *filter_ptr2 =
filter + (m + 2) * in_channels * 25 + c * 25;
filter_data + (m + 2) * in_channels * 25 + c * 25;
const float *filter_ptr3 =
filter + (m + 3) * in_channels * 25 + c * 25;
filter_data + (m + 3) * in_channels * 25 + c * 25;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -158,23 +186,16 @@ void Conv2dNeonK5x5S1(const float *input,
filter_ptr3 -= 25;
} // w
} // h
#else
for (index_t oc = 0; oc < 4; ++oc) {
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 25,
in_width, 5, 5, out_height, out_width,
out_ptr0_base + oc * out_image_size, 1);
}
#endif
} // c
} else {
for (index_t mm = m; mm < out_channels; ++mm) {
float *out_ptr0_base =
output + b * out_batch_size + mm * out_image_size;
output_data + b * out_batch_size + mm * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + mm * in_channels * 25 + c * 25;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
input_data + b * in_batch_size + c * in_image_size;
const float
*filter_ptr0 = filter_data + mm * in_channels * 25 + c * 25;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -204,16 +225,17 @@ void Conv2dNeonK5x5S1(const float *input,
filter_ptr0 -= 25;
} // w
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 5, 5,
out_height, out_width, out_ptr0_base, 1);
#endif
} // c
} // mm
} // if
} // m
} // b
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
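The 5x5 kernel above, the three 7x7 variants, and Conv2dGeneral below all share one calling skeleton, condensed here once for reference. Treating the 1 and 4 arguments as the 1x4 output tile each NEON step produces is an assumption, as is anything about ResizeOutAndPadInOut beyond what the calls themselves show:

std::unique_ptr<const Tensor> padded_input;  // stays null if no pad needed
std::unique_ptr<Tensor> padded_output;       // stays null if already aligned
ResizeOutAndPadInOut(context, input, filter, output, 1, 4,
                     &padded_input, &padded_output);
const Tensor *in_tensor = padded_input ? padded_input.get() : input;
Tensor *out_tensor = padded_output ? padded_output.get() : output;
out_tensor->Clear();               // the kernels accumulate into the output
// ... NEON inner loops read in_tensor and write out_tensor ...
UnPadOutput(*out_tensor, output);  // crops back; no-op when shapes match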
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP32_CONV_2D_5X5_H_
#define MACE_OPS_ARM_FP32_CONV_2D_5X5_H_
#include <vector>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/conv_2d.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
class Conv2dK5x5S1 : public Conv2dBase {
public:
Conv2dK5x5S1(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK5x5S1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_FP32_CONV_2D_5X5_H_
// Copyright 2018 The MACE Authors. All Rights Reserved.
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -12,14 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/ops/arm/conv_2d_neon.h"
#include <memory>
#include "mace/ops/arm/fp32/conv_2d_7x7.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \
......@@ -153,12 +153,40 @@ namespace ops {
vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \
vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK7x7S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
MaceStatus Conv2dK7x7S1::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto in_shape = in_tensor->shape();
auto out_shape = out_tensor->shape();
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
......@@ -173,26 +201,25 @@ void Conv2dNeonK7x7S1(const float *input,
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size;
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
output_data + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
output_data + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
output + b * out_batch_size + (m + 3) * out_image_size;
#endif
output_data + b * out_batch_size + (m + 3) * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
input_data + b * in_batch_size + c * in_image_size;
const float
*filter_ptr0 = filter_data + m * in_channels * 49 + c * 49;
const float *filter_ptr1 =
filter + (m + 1) * in_channels * 49 + c * 49;
filter_data + (m + 1) * in_channels * 49 + c * 49;
const float *filter_ptr2 =
filter + (m + 2) * in_channels * 49 + c * 49;
filter_data + (m + 2) * in_channels * 49 + c * 49;
const float *filter_ptr3 =
filter + (m + 3) * in_channels * 49 + c * 49;
filter_data + (m + 3) * in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -243,23 +270,16 @@ void Conv2dNeonK7x7S1(const float *input,
filter_ptr3 -= 49;
} // w
} // h
#else
for (index_t oc = 0; oc < 4; ++oc) {
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49,
in_width, 7, 7, out_height, out_width,
out_ptr0_base + oc * out_image_size, 1);
}
#endif
} // c
} else {
for (index_t mm = m; mm < out_channels; ++mm) {
float *out_ptr0_base =
output + b * out_batch_size + mm * out_image_size;
output_data + b * out_batch_size + mm * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
input_data + b * in_batch_size + c * in_image_size;
const float
*filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -297,23 +317,50 @@ void Conv2dNeonK7x7S1(const float *input,
filter_ptr0 -= 49;
} // w
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7,
out_height, out_width, out_ptr0_base, 1);
#endif
} // c
} // mm
} // if
} // m
} // b
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK7x7S2(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
MaceStatus Conv2dK7x7S2::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto in_shape = in_tensor->shape();
auto out_shape = out_tensor->shape();
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
......@@ -328,26 +375,25 @@ void Conv2dNeonK7x7S2(const float *input,
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size;
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
output_data + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
output_data + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
output + b * out_batch_size + (m + 3) * out_image_size;
#endif
output_data + b * out_batch_size + (m + 3) * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
input_data + b * in_batch_size + c * in_image_size;
const float
*filter_ptr0 = filter_data + m * in_channels * 49 + c * 49;
const float *filter_ptr1 =
filter + (m + 1) * in_channels * 49 + c * 49;
filter_data + (m + 1) * in_channels * 49 + c * 49;
const float *filter_ptr2 =
filter + (m + 2) * in_channels * 49 + c * 49;
filter_data + (m + 2) * in_channels * 49 + c * 49;
const float *filter_ptr3 =
filter + (m + 3) * in_channels * 49 + c * 49;
filter_data + (m + 3) * in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -403,23 +449,16 @@ void Conv2dNeonK7x7S2(const float *input,
filter_ptr3 -= 49;
} // w
} // h
#else
for (index_t oc = 0; oc < 4; ++oc) {
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49,
in_width, 7, 7, out_height, out_width,
out_ptr0_base + oc * out_image_size, 2);
}
#endif
} // c
} else {
for (index_t mm = m; mm < out_channels; ++mm) {
float *out_ptr0_base =
output + b * out_batch_size + mm * out_image_size;
output_data + b * out_batch_size + mm * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
input_data + b * in_batch_size + c * in_image_size;
const float
*filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -462,23 +501,50 @@ void Conv2dNeonK7x7S2(const float *input,
filter_ptr0 -= 49;
} // w
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7,
out_height, out_width, out_ptr0_base, 2);
#endif
} // c
} // mm
} // if
} // m
} // b
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK7x7S3(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
MaceStatus Conv2dK7x7S3::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto in_shape = in_tensor->shape();
auto out_shape = out_tensor->shape();
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
......@@ -493,26 +559,25 @@ void Conv2dNeonK7x7S3(const float *input,
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size;
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
output_data + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
output_data + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
output + b * out_batch_size + (m + 3) * out_image_size;
#endif
output_data + b * out_batch_size + (m + 3) * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
input_data + b * in_batch_size + c * in_image_size;
const float
*filter_ptr0 = filter_data + m * in_channels * 49 + c * 49;
const float *filter_ptr1 =
filter + (m + 1) * in_channels * 49 + c * 49;
filter_data + (m + 1) * in_channels * 49 + c * 49;
const float *filter_ptr2 =
filter + (m + 2) * in_channels * 49 + c * 49;
filter_data + (m + 2) * in_channels * 49 + c * 49;
const float *filter_ptr3 =
filter + (m + 3) * in_channels * 49 + c * 49;
filter_data + (m + 3) * in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -568,23 +633,16 @@ void Conv2dNeonK7x7S3(const float *input,
filter_ptr3 -= 49;
} // w
} // h
#else
for (index_t oc = 0; oc < 4; ++oc) {
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49,
in_width, 7, 7, out_height, out_width,
out_ptr0_base + oc * out_image_size, 3);
}
#endif
} // c
} else {
for (index_t mm = m; mm < out_channels; ++mm) {
float *out_ptr0_base =
output + b * out_batch_size + mm * out_image_size;
output_data + b * out_batch_size + mm * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
input_data + b * in_batch_size + c * in_image_size;
const float
*filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -627,16 +685,17 @@ void Conv2dNeonK7x7S3(const float *input,
filter_ptr0 -= 49;
} // w
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7,
out_height, out_width, out_ptr0_base, 3);
#endif
} // c
} // mm
} // if
} // m
} // b
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
#define MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
#include <vector>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/conv_2d.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
class Conv2dK7x7S1 : public Conv2dBase {
public:
Conv2dK7x7S1(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK7x7S1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
class Conv2dK7x7S2 : public Conv2dBase {
public:
Conv2dK7x7S2(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK7x7S2() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
class Conv2dK7x7S3 : public Conv2dBase {
public:
Conv2dK7x7S3(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({3, 3}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK7x7S3() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
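The three 7x7 classes differ only in the stride they forward to Conv2dBase ({1,1}, {2,2}, {3,3}). Selection could look like the hypothetical helper below; the real dispatch lives in the conv op setup, which this diff does not show, and MACE_INVALID_ARGS as the fallback status is also an assumption:

// Hypothetical stride dispatch over the concrete 7x7 kernels.
MaceStatus RunConv2dK7x7(int stride,
                         const std::vector<int> &paddings,
                         Padding padding_type,
                         const OpContext *context,
                         const Tensor *input,
                         const Tensor *filter,
                         Tensor *output) {
  if (stride == 1) {
    Conv2dK7x7S1 conv(paddings, padding_type);
    return conv.Compute(context, input, filter, output);
  }
  if (stride == 2) {
    Conv2dK7x7S2 conv(paddings, padding_type);
    return conv.Compute(context, input, filter, output);
  }
  if (stride == 3) {
    Conv2dK7x7S3 conv(paddings, padding_type);
    return conv.Compute(context, input, filter, output);
  }
  return MaceStatus::MACE_INVALID_ARGS;  // assumed fallback
}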
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include "mace/ops/arm/fp32/conv_general.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
MaceStatus Conv2dGeneral::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto in_shape = in_tensor->shape();
auto out_shape = out_tensor->shape();
auto filter_shape = filter->shape();
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = filter_shape[1] * in_image_size;
const index_t out_batch_size = filter_shape[0] * out_image_size;
const index_t filter_size = filter_shape[2] * filter_shape[3];
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < in_shape[0]; b++) {
for (index_t m = 0; m < filter_shape[0]; m += 4) {
const index_t in_width = in_shape[3];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t out_channels = filter_shape[0];
const index_t in_channels = filter_shape[1];
const int stride_h = strides_[0];
const int stride_w = strides_[1];
const int dilation_h = dilations_[0];
const int dilation_w = dilations_[1];
if (m + 3 < out_channels) {
float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size;
float *out_ptr1_base = out_ptr0_base + out_image_size;
float *out_ptr2_base = out_ptr1_base + out_image_size;
float *out_ptr3_base = out_ptr2_base + out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 =
filter_data + m * in_channels * filter_size + c * filter_size;
const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size;
const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size;
const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
index_t ih = h * stride_h;
index_t iw = w * stride_w;
index_t in_offset = ih * in_width + iw;
// output (4 outch x 1 height x 4 width): vo_outch_height
float vo0[4], vo1[4], vo2[4], vo3[4];
// load output
index_t out_offset = h * out_width + w;
for (index_t ow = 0; ow < 4; ++ow) {
vo0[ow] = out_ptr0_base[out_offset + ow];
vo1[ow] = out_ptr1_base[out_offset + ow];
vo2[ow] = out_ptr2_base[out_offset + ow];
vo3[ow] = out_ptr3_base[out_offset + ow];
}
// calc by row
for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
// outch 0
vo0[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr0[kw];
vo0[1] += in_ptr_base[in_offset + stride_w
+ kw * dilation_w] * filter_ptr0[kw];
vo0[2] += in_ptr_base[in_offset + 2 * stride_w
+ kw * dilation_w] * filter_ptr0[kw];
vo0[3] += in_ptr_base[in_offset + 3 * stride_w
+ kw * dilation_w] * filter_ptr0[kw];
// outch 1
vo1[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr1[kw];
vo1[1] += in_ptr_base[in_offset + stride_w
+ kw * dilation_w] * filter_ptr1[kw];
vo1[2] += in_ptr_base[in_offset + 2 * stride_w
+ kw * dilation_w] * filter_ptr1[kw];
vo1[3] += in_ptr_base[in_offset + 3 * stride_w
+ kw * dilation_w] * filter_ptr1[kw];
// outch 2
vo2[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr2[kw];
vo2[1] += in_ptr_base[in_offset + stride_w
+ kw * dilation_w] * filter_ptr2[kw];
vo2[2] += in_ptr_base[in_offset + 2 * stride_w
+ kw * dilation_w] * filter_ptr2[kw];
vo2[3] += in_ptr_base[in_offset + 3 * stride_w
+ kw * dilation_w] * filter_ptr2[kw];
// outch 3
vo3[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr3[kw];
vo3[1] += in_ptr_base[in_offset + stride_w
+ kw * dilation_w] * filter_ptr3[kw];
vo3[2] += in_ptr_base[in_offset + 2 * stride_w
+ kw * dilation_w] * filter_ptr3[kw];
vo3[3] += in_ptr_base[in_offset + 3 * stride_w
+ kw * dilation_w] * filter_ptr3[kw];
} // kw
in_offset += dilation_h * in_width;
filter_ptr0 += filter_shape[3];
filter_ptr1 += filter_shape[3];
filter_ptr2 += filter_shape[3];
filter_ptr3 += filter_shape[3];
} // kh
for (index_t ow = 0; ow < 4; ++ow) {
out_ptr0_base[out_offset + ow] = vo0[ow];
out_ptr1_base[out_offset + ow] = vo1[ow];
out_ptr2_base[out_offset + ow] = vo2[ow];
out_ptr3_base[out_offset + ow] = vo3[ow];
}
filter_ptr0 -= filter_size;
filter_ptr1 -= filter_size;
filter_ptr2 -= filter_size;
filter_ptr3 -= filter_size;
} // w
} // h
} // c
} else {
for (index_t mm = m; mm < out_channels; ++mm) {
float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 =
filter_data + mm * in_channels * filter_size + c * filter_size;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
index_t ih = h * stride_h;
index_t iw = w * stride_w;
index_t in_offset = ih * in_width + iw;
// output (1 outch x 1 height x 4 width): vo_outch_height
float vo0[4];
// load output
index_t out_offset = h * out_width + w;
for (index_t ow = 0; ow < 4; ++ow) {
vo0[ow] = out_ptr0_base[out_offset + ow];
}
// calc by row
for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
// outch 0
vo0[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr0[kw];
vo0[1] += in_ptr_base[in_offset + stride_w
+ kw * dilation_w] * filter_ptr0[kw];
vo0[2] += in_ptr_base[in_offset + 2 * stride_w
+ kw * dilation_w] * filter_ptr0[kw];
vo0[3] += in_ptr_base[in_offset + 3 * stride_w
+ kw * dilation_w] * filter_ptr0[kw];
} // kw
in_offset += dilation_h * in_width;
filter_ptr0 += filter_shape[3];
} // kh
for (index_t ow = 0; ow < 4; ++ow) {
out_ptr0_base[out_offset + ow] = vo0[ow];
}
filter_ptr0 -= filter_size;
} // w
} // h
} // c
} // mm
} // if
} // m
} // b
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
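The 4-output-channel by 4-width register blocking above reduces to the plain NCHW convolution below. A minimal scalar reference of the same indexing, as a sketch: the input is assumed to be pre-padded by ResizeOutAndPadInOut, so no bounds checks appear:

#include <cstdint>

// out[b][m][h][w] = sum over c, kh, kw of
//   in[b][c][h*stride_h + kh*dilation_h][w*stride_w + kw*dilation_w]
//   * filter[m][c][kh][kw]
inline void Conv2dNCHWRef(const float *in, const float *filter, float *out,
                          int64_t batch, int64_t in_channels,
                          int64_t in_height, int64_t in_width,
                          int64_t out_channels, int64_t out_height,
                          int64_t out_width,
                          int64_t filter_h, int64_t filter_w,
                          int stride_h, int stride_w,
                          int dilation_h, int dilation_w) {
  for (int64_t b = 0; b < batch; ++b) {
    for (int64_t m = 0; m < out_channels; ++m) {
      for (int64_t h = 0; h < out_height; ++h) {
        for (int64_t w = 0; w < out_width; ++w) {
          float sum = 0.f;
          for (int64_t c = 0; c < in_channels; ++c) {
            for (int64_t kh = 0; kh < filter_h; ++kh) {
              for (int64_t kw = 0; kw < filter_w; ++kw) {
                const int64_t ih = h * stride_h + kh * dilation_h;
                const int64_t iw = w * stride_w + kw * dilation_w;
                sum += in[((b * in_channels + c) * in_height + ih)
                              * in_width + iw]
                    * filter[((m * in_channels + c) * filter_h + kh)
                                 * filter_w + kw];
              }
            }
          }
          out[((b * out_channels + m) * out_height + h) * out_width + w] =
              sum;
        }
      }
    }
  }
}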
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP32_CONV_GENERAL_H_
#define MACE_OPS_ARM_FP32_CONV_GENERAL_H_
#include <vector>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/conv_2d.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
class Conv2dGeneral : public Conv2dBase {
public:
Conv2dGeneral(const std::vector<int> strides,
const std::vector<int> dilations,
const std::vector<int> paddings,
const Padding padding_type)
: Conv2dBase(strides, dilations, paddings, padding_type) {}
virtual ~Conv2dGeneral() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_FP32_CONV_GENERAL_H_
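Conv2dGeneral is the fallback for any stride/dilation combination the specialized kernels do not cover; construction mirrors the specialized classes but with caller-supplied strides and dilations. A usage sketch with hypothetical attribute values:

std::vector<int> strides = {2, 2};
std::vector<int> dilations = {2, 2};  // dilated conv falls through to here
std::vector<int> paddings;            // empty: derived from padding_type
Conv2dGeneral conv(strides, dilations, paddings, SAME);
// conv.Compute(context, input, filter, output);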
// Copyright 2018 The MACE Authors. All Rights Reserved.
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
This diff is collapsed.
......@@ -16,7 +16,6 @@
#include "mace/ops/ref/conv_2d.h"
#include <vector>
#include "mace/ops/common/conv_pool_2d_util.h"
namespace mace {
namespace ops {
......@@ -30,31 +29,36 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
const std::vector<index_t> in_shape = input->shape();
const std::vector<index_t> filter_shape = filter->shape();
const std::vector<index_t> out_shape = output->shape();
const std::vector<int> stride_hw{stride_h_, stride_w_};
const std::vector<int> dilation_hw{dilation_h_, dilation_w_};
const std::vector<int> paddings{pad_h_, pad_w_};
const index_t pad_top = pad_h_ >> 1;
const index_t pad_left = pad_w_ >> 1;
std::vector<index_t> output_shape(4);
CalcOutputSize(in_shape.data(),
NCHW,
filter_shape.data(),
OIHW,
paddings.data(),
dilation_hw.data(),
stride_hw.data(),
RoundType::FLOOR,
output_shape.data());
output->Resize(output_shape);
std::vector<index_t> out_shape(4);
std::vector<int> paddings(2);
if (paddings_.empty()) {
CalcNCHWPaddingAndOutputSize(input->shape().data(),
filter->shape().data(),
dilations_.data(),
strides_.data(),
padding_type_,
out_shape.data(),
paddings.data());
} else {
paddings = paddings_;
CalcNCHWOutputSize(input->shape().data(),
filter->shape().data(),
paddings_.data(),
dilations_.data(),
strides_.data(),
RoundType::FLOOR,
out_shape.data());
}
const index_t pad_top = paddings[0] >> 1;
const index_t pad_left = paddings[1] >> 1;
output->Resize(out_shape);
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = filter_shape[1] * in_image_size;
const index_t out_batch_size = filter_shape[0] * out_image_size;
const index_t filter_size = filter_shape[2] * filter_shape[3];
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard output_guard(output);
......@@ -86,8 +90,10 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
const index_t ih = -pad_top + h * stride_h_ + kh * dilation_h_;
const index_t iw = -pad_left + w * stride_w_ + kw * dilation_w_;
const index_t
ih = -pad_top + h * strides_[0] + kh * dilations_[0];
const index_t
iw = -pad_left + w * strides_[1] + kw * dilations_[1];
if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
sum += in_ptr_base[ih * in_width + iw] * filter_ptr[kw];
}
......
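In the hunk above, paddings holds the total padding per axis, and the right-shift assigns the smaller half to the top/left edge. A small illustration; pad_bottom and pad_right are not used by this kernel (the bounds check on ih/iw covers them) and are shown only to complete the picture:

const int pad_top = paddings[0] >> 1;          // e.g. total 3 -> top 1
const int pad_bottom = paddings[0] - pad_top;  //              bottom 2
const int pad_left = paddings[1] >> 1;
const int pad_right = paddings[1] - pad_left;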
......@@ -16,9 +16,12 @@
#ifndef MACE_OPS_REF_CONV_2D_H_
#define MACE_OPS_REF_CONV_2D_H_
#include <vector>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/common/conv_pool_2d_util.h"
namespace mace {
namespace ops {
......@@ -27,30 +30,39 @@ namespace ref {
template<typename OUTPUT_TYPE>
class Conv2d {
public:
Conv2d(int stride_h, int stride_w, int dilation_h, int dilation_w);
Conv2d(const std::vector<int> strides,
const std::vector<int> dilations,
const std::vector<int> paddings,
const Padding padding_type)
: strides_(strides),
dilations_(dilations),
paddings_(paddings),
padding_type_(padding_type) {}
~Conv2d() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
private:
const std::vector<int> strides_;
const std::vector<int> dilations_;
const std::vector<int> paddings_;
const Padding padding_type_;
};
template<>
class Conv2d<float> {
public:
Conv2d(int pad_h,
int pad_w,
int stride_h,
int stride_w,
int dilation_h,
int dilation_w)
: pad_h_(pad_h),
pad_w_(pad_w),
stride_h_(stride_h),
stride_w_(stride_w),
dilation_h_(dilation_h),
dilation_w_(dilation_w) {}
Conv2d(const std::vector<int> strides,
const std::vector<int> dilations,
const std::vector<int> paddings,
const Padding padding_type)
: strides_(strides),
dilations_(dilations),
paddings_(paddings),
padding_type_(padding_type) {}
~Conv2d() {}
// Always row-major after transpose
MaceStatus Compute(
......@@ -60,12 +72,10 @@ class Conv2d<float> {
Tensor *output);
private:
int pad_h_;
int pad_w_;
int stride_h_;
int stride_w_;
int dilation_h_;
int dilation_w_;
const std::vector<int> strides_;
const std::vector<int> dilations_;
const std::vector<int> paddings_;
const Padding padding_type_;
};
} // namespace ref
......
......@@ -1969,7 +1969,8 @@ class Transformer(base_converter.ConverterInterface):
else:
print("Quantize op %s (%s)" % (op.name, op.type))
non_zero = self._option.device == DeviceType.CPU.value
non_zero = self._option.device == DeviceType.CPU.value \
and op.type == MaceOp.MatMul.name
for idx, input_tensor in enumerate(op.input):
quantized_inputs_names.append(input_tensor)
......