Merge branch 'optimize_deconv_cpu' into 'master'

optimize deconv cpu See merge request !797

Merge branch 'optimize_deconv_cpu' into 'master'
optimize deconv cpu See merge request !797
c2523ee2 · 刘托 · e2a40a03 · ce1594a6 · c2523ee2 · c2523ee2
8 changed file
--- a/mace/examples/android/gradle/wrapper/gradle-wrapper.properties
+++ b/mace/examples/android/gradle/wrapper/gradle-wrapper.properties
-#Wed May 02 11:53:35 CST 2018
+#Wed Sep 12 13:35:24 CST 2018
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME

--- a/mace/kernels/arm/deconv_2d_neon.h
+++ b/mace/kernels/arm/deconv_2d_neon.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_KERNELS_ARM_DECONV_2D_NEON_H_
+#define MACE_KERNELS_ARM_DECONV_2D_NEON_H_
+
+#if defined(MACE_ENABLE_NEON)
+#include <arm_neon.h>
+#endif
+
+#include "mace/core/types.h"
+
+namespace mace {
+namespace kernels {
+
+void Deconv2dNeonK3x3S1(const float *input,
+                        const float *filter,
+                        const float *bias,
+                        const index_t *in_shape,
+                        const index_t *out_shape,
+                        float *output);
+
+void Deconv2dNeonK3x3S2(const float *input,
+                        const float *filter,
+                        const float *bias,
+                        const index_t *in_shape,
+                        const index_t *out_shape,
+                        float *output);
+
+void Deconv2dNeonK4x4S1(const float *input,
+                        const float *filter,
+                        const float *bias,
+                        const index_t *in_shape,
+                        const index_t *out_shape,
+                        float *output);
+
+void Deconv2dNeonK4x4S2(const float *input,
+                        const float *filter,
+                        const float *bias,
+                        const index_t *in_shape,
+                        const index_t *out_shape,
+                        float *output);
+
+#ifdef MACE_ENABLE_NEON
+inline float32x4_t neon_vfma_lane_0(float32x4_t a,
+                          float32x4_t b,
+                          float32x4_t c) {
+#ifdef __aarch64__
+  return vfmaq_laneq_f32(a, b, c, 0);
+#else
+  return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
+#endif
+}
+
+inline float32x4_t neon_vfma_lane_1(float32x4_t a,
+                          float32x4_t b,
+                          float32x4_t c) {
+#ifdef __aarch64__
+  return vfmaq_laneq_f32(a, b, c, 1);
+#else
+  return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
+#endif
+}
+
+inline float32x4_t neon_vfma_lane_2(float32x4_t a,
+                          float32x4_t b,
+                          float32x4_t c) {
+#ifdef __aarch64__
+  return vfmaq_laneq_f32(a, b, c, 2);
+#else
+  return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
+#endif
+}
+
+inline float32x4_t neon_vfma_lane_3(float32x4_t a,
+                          float32x4_t b,
+                          float32x4_t c) {
+#ifdef __aarch64__
+  return vfmaq_laneq_f32(a, b, c, 3);
+#else
+  return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
+#endif
+}
+#endif
+
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_ARM_DECONV_2D_NEON_H_
--- a/mace/kernels/arm/deconv_2d_neon_3x3.cc
+++ b/mace/kernels/arm/deconv_2d_neon_3x3.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/macros.h"
+#include "mace/kernels/arm/deconv_2d_neon.h"
+
+namespace mace {
+namespace kernels {
+
+void Deconv2dNeonK3x3S1(const float *input,
+                        const float *filter,
+                        const float *bias,
+                        const index_t *in_shape,
+                        const index_t *out_shape,
+                        float *output) {
+  const index_t inch = in_shape[1];
+  const index_t h = in_shape[2];
+  const index_t w = in_shape[3];
+
+  const index_t outch = out_shape[1];
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+
+  const index_t out_img_size = outh * outw;
+
+#pragma omp parallel for collapse(2)
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t oc = 0; oc < outch; oc += 2) {
+      if (oc + 1 < outch) {
+        float *out_base0 = output + (b * outch + oc) * out_img_size;
+        float *out_base1 = out_base0 + out_img_size;
+
+        const float bias_value0 = bias ? bias[oc] : 0.f;
+        const float bias_value1 = bias ? bias[oc + 1] : 0.f;
+        std::fill_n(out_base0, out_img_size, bias_value0);
+        std::fill_n(out_base1, out_img_size, bias_value1);
+
+        for (index_t ic = 0; ic < inch; ++ic) {
+          const float *input_base = input + (b * inch + ic) * h * w;
+          const float *kernel_base0 = filter + (oc * inch + ic) * 9;
+          const float *kernel_base1 = kernel_base0 + inch * 9;
+          const float *in = input_base;
+
+          // output channel 0
+          const float *k0_0 = kernel_base0;
+          const float *k0_1 = kernel_base0 + 3;
+          const float *k0_2 = kernel_base0 + 5;
+          // output channel 1
+          const float *k1_0 = kernel_base1;
+          const float *k1_1 = kernel_base1 + 3;
+          const float *k1_2 = kernel_base1 + 5;
+
+#if defined(MACE_ENABLE_NEON)
+          // load filter
+          float32x4_t k00_vec, k01_vec, k02_vec;
+          float32x4_t k10_vec, k11_vec, k12_vec;
+
+          k00_vec = vld1q_f32(k0_0);
+          k01_vec = vld1q_f32(k0_1);
+          k02_vec = vld1q_f32(k0_2);
+
+          k10_vec = vld1q_f32(k1_0);
+          k11_vec = vld1q_f32(k1_1);
+          k12_vec = vld1q_f32(k1_2);
+#endif
+          for (index_t i = 0; i < h; ++i) {
+            float *out_row_base0 = out_base0 + i * outw;
+            float *out_row0_0 = out_row_base0;
+            float *out_row0_1 = out_row_base0 + outw;
+            float *out_row0_2 = out_row_base0 + 2 * outw;
+
+            float *out_row_base1 = out_base1 + i * outw;
+            float *out_row1_0 = out_row_base1;
+            float *out_row1_1 = out_row_base1 + outw;
+            float *out_row1_2 = out_row_base1 + 2 * outw;
+
+            index_t j = 0;
+#if defined(MACE_ENABLE_NEON)
+            for (; j + 3 < w; j += 4) {
+              float32x4_t in_vec = vld1q_f32(in);
+
+              float32x4_t out00, out01, out02;
+              float32x4_t out10, out11, out12;
+              float32x4_t out20, out21, out22;
+
+              out00 = vld1q_f32(out_row0_0);
+              out00 = neon_vfma_lane_0(out00, in_vec, k00_vec);
+              vst1q_f32(out_row0_0, out00);
+
+              out01 = vld1q_f32(out_row0_0 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k00_vec);
+              vst1q_f32(out_row0_0 + 1, out01);
+
+              out02 = vld1q_f32(out_row0_0 + 2);
+              out02 = neon_vfma_lane_2(out02, in_vec, k00_vec);
+              vst1q_f32(out_row0_0 + 2, out02);
+
+              out10 = vld1q_f32(out_row0_1 + 0);
+              out10 = neon_vfma_lane_0(out10, in_vec, k01_vec);
+              vst1q_f32(out_row0_1 + 0, out10);
+
+              out11 = vld1q_f32(out_row0_1 + 1);
+              out11 = neon_vfma_lane_1(out11, in_vec, k01_vec);
+              vst1q_f32(out_row0_1 + 1, out11);
+
+              out12 = vld1q_f32(out_row0_1 + 2);
+              out12 = neon_vfma_lane_2(out12, in_vec, k01_vec);
+              vst1q_f32(out_row0_1 + 2, out12);
+
+              out20 = vld1q_f32(out_row0_2 + 0);
+              out20 = neon_vfma_lane_1(out20, in_vec, k02_vec);
+              vst1q_f32(out_row0_2 + 0, out20);
+
+              out21 = vld1q_f32(out_row0_2 + 1);
+              out21 = neon_vfma_lane_2(out21, in_vec, k02_vec);
+              vst1q_f32(out_row0_2 + 1, out21);
+
+              out22 = vld1q_f32(out_row0_2 + 2);
+              out22 = neon_vfma_lane_3(out22, in_vec, k02_vec);
+              vst1q_f32(out_row0_2 + 2, out22);
+
+              out00 = vld1q_f32(out_row1_0 + 0);
+              out00 = neon_vfma_lane_0(out00, in_vec, k10_vec);
+              vst1q_f32(out_row1_0 + 0, out00);
+
+              out01 = vld1q_f32(out_row1_0 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k10_vec);
+              vst1q_f32(out_row1_0 + 1, out01);
+
+              out02 = vld1q_f32(out_row1_0 + 2);
+              out02 = neon_vfma_lane_2(out02, in_vec, k10_vec);
+              vst1q_f32(out_row1_0 + 2, out02);
+
+              out10 = vld1q_f32(out_row1_1 + 0);
+              out10 = neon_vfma_lane_0(out10, in_vec, k11_vec);
+              vst1q_f32(out_row1_1 + 0, out10);
+
+              out11 = vld1q_f32(out_row1_1 + 1);
+              out11 = neon_vfma_lane_1(out11, in_vec, k11_vec);
+              vst1q_f32(out_row1_1 + 1, out11);
+
+              out12 = vld1q_f32(out_row1_1 + 2);
+              out12 = neon_vfma_lane_2(out12, in_vec, k11_vec);
+              vst1q_f32(out_row1_1 + 2, out12);
+
+              out20 = vld1q_f32(out_row1_2 + 0);
+              out20 = neon_vfma_lane_1(out20, in_vec, k12_vec);
+              vst1q_f32(out_row1_2 + 0, out20);
+
+              out21 = vld1q_f32(out_row1_2 + 1);
+              out21 = neon_vfma_lane_2(out21, in_vec, k12_vec);
+              vst1q_f32(out_row1_2 + 1, out21);
+
+              out22 = vld1q_f32(out_row1_2 + 2);
+              out22 = neon_vfma_lane_3(out22, in_vec, k12_vec);
+              vst1q_f32(out_row1_2 + 2, out22);
+
+              in += 4;
+              out_row0_0 += 4;
+              out_row0_1 += 4;
+              out_row0_2 += 4;
+              out_row1_0 += 4;
+              out_row1_1 += 4;
+              out_row1_2 += 4;
+            }
+#endif
+            for (; j < w; ++j) {
+              float val = in[0];
+              for (int k = 0; k < 3; ++k) {
+                out_row0_0[k] += val * k0_0[k];
+                out_row0_1[k] += val * k0_1[k];
+                out_row0_2[k] += val * k0_2[k + 1];
+                out_row1_0[k] += val * k1_0[k];
+                out_row1_1[k] += val * k1_1[k];
+                out_row1_2[k] += val * k1_2[k + 1];
+              }
+              in++;
+              out_row0_0++;
+              out_row0_1++;
+              out_row0_2++;
+              out_row1_0++;
+              out_row1_1++;
+              out_row1_2++;
+            }
+          }
+        }
+      } else {
+        float *out_base0 = output + (b * outch + oc) * outh * outw;
+        const float bias_value0 = bias ? bias[oc] : 0.f;
+        std::fill_n(out_base0, outh * outw, bias_value0);
+        for (index_t ic = 0; ic < inch; ++ic) {
+          const float *input_base = input + (b * inch + ic) * h * w;
+          const float *kernel_base0 = filter + (oc * inch + ic) * 9;
+          const float *in = input_base;
+          const float *k0_0 = kernel_base0;
+          const float *k0_1 = kernel_base0 + 3;
+          const float *k0_2 = kernel_base0 + 5;
+
+#if defined(MACE_ENABLE_NEON)
+          // load filter
+          float32x4_t k00_vec = vld1q_f32(k0_0);
+          float32x4_t k01_vec = vld1q_f32(k0_1);
+          float32x4_t k02_vec = vld1q_f32(k0_2);
+#endif
+          for (index_t i = 0; i < h; ++i) {
+            float *out_row_base0 = out_base0 + i * outw;
+            float *out_row0_0 = out_row_base0;
+            float *out_row0_1 = out_row_base0 + outw;
+            float *out_row0_2 = out_row_base0 + 2 * outw;
+            index_t j = 0;
+#if defined(MACE_ENABLE_NEON)
+            for (; j + 3 < w; j += 4) {
+              float32x4_t in_vec = vld1q_f32(in);
+
+              float32x4_t out00, out01, out02;
+              float32x4_t out10, out11, out12;
+              float32x4_t out20, out21, out22;
+
+              out00 = vld1q_f32(out_row0_0 + 0);
+              out00 = neon_vfma_lane_0(out00, in_vec, k00_vec);
+              vst1q_f32(out_row0_0 + 0, out00);
+
+              out01 = vld1q_f32(out_row0_0 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k00_vec);
+              vst1q_f32(out_row0_0 + 1, out01);
+
+              out02 = vld1q_f32(out_row0_0 + 2);
+              out02 = neon_vfma_lane_2(out02, in_vec, k00_vec);
+              vst1q_f32(out_row0_0 + 2, out02);
+
+              out10 = vld1q_f32(out_row0_1 + 0);
+              out10 = neon_vfma_lane_0(out10, in_vec, k01_vec);
+              vst1q_f32(out_row0_1 + 0, out10);
+
+              out11 = vld1q_f32(out_row0_1 + 1);
+              out11 = neon_vfma_lane_1(out11, in_vec, k01_vec);
+              vst1q_f32(out_row0_1 + 1, out11);
+
+              out12 = vld1q_f32(out_row0_1 + 2);
+              out12 = neon_vfma_lane_2(out12, in_vec, k01_vec);
+              vst1q_f32(out_row0_1 + 2, out12);
+
+              out20 = vld1q_f32(out_row0_2 + 0);
+              out20 = neon_vfma_lane_1(out20, in_vec, k02_vec);
+              vst1q_f32(out_row0_2 + 0, out20);
+
+              out21 = vld1q_f32(out_row0_2 + 1);
+              out21 = neon_vfma_lane_2(out21, in_vec, k02_vec);
+              vst1q_f32(out_row0_2 + 1, out21);
+
+              out22 = vld1q_f32(out_row0_2 + 2);
+              out22 = neon_vfma_lane_3(out22, in_vec, k02_vec);
+              vst1q_f32(out_row0_2 + 2, out22);
+
+              in += 4;
+              out_row0_0 += 4;
+              out_row0_1 += 4;
+              out_row0_2 += 4;
+            }
+#endif
+            for (; j < w; ++j) {
+              float val = in[0];
+              for (int k = 0; k < 3; ++k) {
+                out_row0_0[k] += val * k0_0[k];
+                out_row0_1[k] += val * k0_1[k];
+                out_row0_2[k] += val * k0_2[k + 1];
+              }
+              in++;
+              out_row0_0++;
+              out_row0_1++;
+              out_row0_2++;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void Deconv2dNeonK3x3S2(const float *input,
+                        const float *filter,
+                        const float *bias,
+                        const index_t *in_shape,
+                        const index_t *out_shape,
+                        float *output) {
+  const index_t inch = in_shape[1];
+  const index_t h = in_shape[2];
+  const index_t w = in_shape[3];
+
+  const index_t outch = out_shape[1];
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+  const index_t out_img_size = outh * outw;
+
+#pragma omp parallel for collapse(2)
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t oc = 0; oc < outch; ++oc) {
+      float *out_base = output + (b * outch + oc) * out_img_size;
+
+      const float bias_value = bias ? bias[oc] : 0.f;
+      std::fill_n(out_base, out_img_size, bias_value);
+
+      for (index_t ic = 0; ic < inch; ++ic) {
+        const float *input_base = input + (b * inch + ic) * h * w;
+        const float *kernel_base = filter + (oc * inch + ic) * 9;
+        const float *in = input_base;
+
+        const float *k0 = kernel_base;
+        const float *k1 = kernel_base + 3;
+        const float *k2 = kernel_base + 5;
+
+#if defined(MACE_ENABLE_NEON)
+        float32x4_t k0_vec = vld1q_f32(k0);
+        float32x4_t k1_vec = vld1q_f32(k1);
+        float32x4_t k2_vec = vld1q_f32(k2);
+#endif
+        for (index_t i = 0; i < h; ++i) {
+          float *out_row_base = out_base + i * 2 * outw;
+          float *out_row_0 = out_row_base;
+          float *out_row_1 = out_row_0 + outw;
+          float *out_row_2 = out_row_1 + outw;
+
+          index_t j = 0;
+#if defined(MACE_ENABLE_NEON)
+          for (; j + 3 < w; j += 4) {
+            float32x4_t in_vec = vld1q_f32(in);
+
+            // out row 0
+            float32x4x2_t out00 = vld2q_f32(out_row_0);
+            out00.val[0] =
+              neon_vfma_lane_0(out00.val[0], in_vec, k0_vec);
+            out00.val[1] =
+              neon_vfma_lane_1(out00.val[1], in_vec, k0_vec);
+            vst2q_f32(out_row_0, out00);
+
+            float32x4x2_t out01 = vld2q_f32(out_row_0 + 2);
+            out01.val[0] =
+              neon_vfma_lane_2(out01.val[0], in_vec, k0_vec);
+            vst2q_f32(out_row_0 + 2, out01);
+
+            // out row 1
+            float32x4x2_t out10 = vld2q_f32(out_row_1);
+            out10.val[0] =
+              neon_vfma_lane_0(out10.val[0], in_vec, k1_vec);
+            out10.val[1] =
+              neon_vfma_lane_1(out10.val[1], in_vec, k1_vec);
+            vst2q_f32(out_row_1, out10);
+
+            float32x4x2_t out11 = vld2q_f32(out_row_1 + 2);
+            out11.val[0] =
+              neon_vfma_lane_2(out11.val[0], in_vec, k1_vec);
+            vst2q_f32(out_row_1 + 2, out11);
+
+            // out row 2
+            float32x4x2_t out20 = vld2q_f32(out_row_2);
+            out20.val[0] =
+              neon_vfma_lane_1(out20.val[0], in_vec, k2_vec);
+            out20.val[1] =
+              neon_vfma_lane_2(out20.val[1], in_vec, k2_vec);
+            vst2q_f32(out_row_2, out20);
+
+            float32x4x2_t out21 = vld2q_f32(out_row_2 + 2);
+            out21.val[0] =
+              neon_vfma_lane_3(out21.val[0], in_vec, k2_vec);
+            vst2q_f32(out_row_2 + 2, out21);
+
+            in += 4;
+            out_row_0 += 8;
+            out_row_1 += 8;
+            out_row_2 += 8;
+          }
+#endif
+          for (; j < w; ++j) {
+            float val = in[0];
+
+            for (int k = 0; k < 3; ++k) {
+              out_row_0[k] += val * k0[k];
+              out_row_1[k] += val * k1[k];
+              out_row_2[k] += val * k2[k + 1];
+            }
+
+            in++;
+            out_row_0 += 2;
+            out_row_1 += 2;
+            out_row_2 += 2;
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace kernels
+}  // namespace mace
--- a/mace/kernels/arm/deconv_2d_neon_4x4.cc
+++ b/mace/kernels/arm/deconv_2d_neon_4x4.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/macros.h"
+#include "mace/kernels/arm/deconv_2d_neon.h"
+
+namespace mace {
+namespace kernels {
+
+void Deconv2dNeonK4x4S1(const float *input,
+                        const float *filter,
+                        const float *bias,
+                        const index_t *in_shape,
+                        const index_t *out_shape,
+                        float *output) {
+  const index_t w = in_shape[3];
+  const index_t h = in_shape[2];
+  const index_t inch = in_shape[1];
+
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+  const index_t outch = out_shape[1];
+  const index_t out_img_size = outh * outw;
+#pragma omp parallel for
+  for (int b = 0; b < out_shape[0]; ++b) {
+    for (int oc = 0; oc < outch; oc += 2) {
+      if (oc + 1 < outch) {
+        float *out_base = output + (b * outch + oc) * out_img_size;
+        float *out_base1 = out_base + out_img_size;
+        const float bias_value = bias ? bias[oc] : 0.f;
+        std::fill_n(out_base, out_img_size, bias_value);
+        const float bias_value1 = bias ? bias[oc + 1] : 0.f;
+        std::fill_n(out_base1, out_img_size, bias_value1);
+        for (int q = 0; q < inch; q++) {
+          const float *input_base = input + (b * inch + q) * h * w;
+          const float *in = input_base;
+          const float *kernel_base = filter + (oc * inch + q) * 16;
+          const float *k0 = kernel_base;
+          const float *k1 = kernel_base + 4;
+          const float *k2 = kernel_base + 8;
+          const float *k3 = kernel_base + 12;
+
+          const float *kernel_base1 = kernel_base + inch * 16;
+          const float *k10 = kernel_base1;
+          const float *k11 = kernel_base1 + 4;
+          const float *k12 = kernel_base1 + 8;
+          const float *k13 = kernel_base1 + 12;
+#if defined(MACE_ENABLE_NEON)
+          float32x4_t k0_vec = vld1q_f32(k0);
+          float32x4_t k1_vec = vld1q_f32(k1);
+          float32x4_t k2_vec = vld1q_f32(k2);
+          float32x4_t k3_vec = vld1q_f32(k3);
+
+          float32x4_t k10_vec = vld1q_f32(k10);
+          float32x4_t k11_vec = vld1q_f32(k11);
+          float32x4_t k12_vec = vld1q_f32(k12);
+          float32x4_t k13_vec = vld1q_f32(k13);
+#endif
+          for (int i = 0; i < h; i++) {
+            float *out_row = out_base + i * outw;
+
+            float *out_row_0 = out_row;
+            float *out_row_1 = out_row_0 + outw;
+            float *out_row_2 = out_row_1 + outw;
+            float *out_row_3 = out_row_2 + outw;
+
+            float *out_row1 = out_base1 + i * outw;
+
+            float *out_row1_0 = out_row1;
+            float *out_row1_1 = out_row1_0 + outw;
+            float *out_row1_2 = out_row1_1 + outw;
+            float *out_row1_3 = out_row1_2 + outw;
+
+            int j = 0;
+#if defined(MACE_ENABLE_NEON)
+            for (; j + 3 < w; j += 4) {
+              float32x4_t in_vec = vld1q_f32(in);
+              float32x4_t out00, out01, out02, out03;
+              float32x4_t out10, out11, out12, out13;
+
+              out00 = vld1q_f32(out_row_0);
+              out00 = neon_vfma_lane_0(out00, in_vec, k0_vec);
+              vst1q_f32(out_row_0, out00);
+
+              out10 = vld1q_f32(out_row1_0);
+              out10 = neon_vfma_lane_0(out10, in_vec, k10_vec);
+              vst1q_f32(out_row1_0, out10);
+
+              out01 = vld1q_f32(out_row_0 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k0_vec);
+              vst1q_f32(out_row_0 + 1, out01);
+
+              out11 = vld1q_f32(out_row1_0 + 1);
+              out11 = neon_vfma_lane_1(out11, in_vec, k10_vec);
+              vst1q_f32(out_row1_0 + 1, out11);
+
+              out02 = vld1q_f32(out_row_0 + 2);
+              out02 = neon_vfma_lane_2(out02, in_vec, k0_vec);
+              vst1q_f32(out_row_0 + 2, out02);
+
+              out12 = vld1q_f32(out_row1_0 + 2);
+              out12 = neon_vfma_lane_2(out12, in_vec, k10_vec);
+              vst1q_f32(out_row1_0 + 2, out12);
+
+              out03 = vld1q_f32(out_row_0 + 3);
+              out03 = neon_vfma_lane_3(out03, in_vec, k0_vec);
+              vst1q_f32(out_row_0 + 3, out03);
+
+              out13 = vld1q_f32(out_row1_0 + 3);
+              out13 = neon_vfma_lane_3(out13, in_vec, k10_vec);
+              vst1q_f32(out_row1_0 + 3, out13);
+
+              //
+              out00 = vld1q_f32(out_row_1);
+              out00 = neon_vfma_lane_0(out00, in_vec, k1_vec);
+              vst1q_f32(out_row_1, out00);
+
+              out10 = vld1q_f32(out_row1_1);
+              out10 = neon_vfma_lane_0(out10, in_vec, k11_vec);
+              vst1q_f32(out_row1_1, out10);
+
+              out01 = vld1q_f32(out_row_1 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k1_vec);
+              vst1q_f32(out_row_1 + 1, out01);
+
+              out11 = vld1q_f32(out_row1_1 + 1);
+              out11 = neon_vfma_lane_1(out11, in_vec, k11_vec);
+              vst1q_f32(out_row1_1 + 1, out11);
+
+              out02 = vld1q_f32(out_row_1 + 2);
+              out02 = neon_vfma_lane_2(out02, in_vec, k1_vec);
+              vst1q_f32(out_row_1 + 2, out02);
+
+              out12 = vld1q_f32(out_row1_1 + 2);
+              out12 = neon_vfma_lane_2(out12, in_vec, k11_vec);
+              vst1q_f32(out_row1_1 + 2, out12);
+
+              out03 = vld1q_f32(out_row_1 + 3);
+              out03 = neon_vfma_lane_3(out03, in_vec, k1_vec);
+              vst1q_f32(out_row_1 + 3, out03);
+
+              out13 = vld1q_f32(out_row1_1 + 3);
+              out13 = neon_vfma_lane_3(out13, in_vec, k11_vec);
+              vst1q_f32(out_row1_1 + 3, out13);
+
+              //
+              out00 = vld1q_f32(out_row_2 + 0);
+              out00 = neon_vfma_lane_0(out00, in_vec, k2_vec);
+              vst1q_f32(out_row_2 + 0, out00);
+
+              out10 = vld1q_f32(out_row1_2 + 0);
+              out10 = neon_vfma_lane_0(out10, in_vec, k12_vec);
+              vst1q_f32(out_row1_2 + 0, out10);
+
+              out01 = vld1q_f32(out_row_2 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k2_vec);
+              vst1q_f32(out_row_2 + 1, out01);
+
+              out11 = vld1q_f32(out_row1_2 + 1);
+              out11 = neon_vfma_lane_1(out11, in_vec, k12_vec);
+              vst1q_f32(out_row1_2 + 1, out11);
+
+              out02 = vld1q_f32(out_row_2 + 2);
+              out02 = neon_vfma_lane_2(out02, in_vec, k2_vec);
+              vst1q_f32(out_row_2 + 2, out02);
+
+              out12 = vld1q_f32(out_row1_2 + 2);
+              out12 = neon_vfma_lane_2(out12, in_vec, k12_vec);
+              vst1q_f32(out_row1_2 + 2, out12);
+
+              out03 = vld1q_f32(out_row_2 + 3);
+              out03 = neon_vfma_lane_3(out03, in_vec, k2_vec);
+              vst1q_f32(out_row_2 + 3, out03);
+
+              out13 = vld1q_f32(out_row1_2 + 3);
+              out13 = neon_vfma_lane_3(out13, in_vec, k12_vec);
+              vst1q_f32(out_row1_2 + 3, out13);
+
+              //
+              out00 = vld1q_f32(out_row_3 + 0);
+              out00 = neon_vfma_lane_0(out00, in_vec, k3_vec);
+              vst1q_f32(out_row_3 + 0, out00);
+
+              out10 = vld1q_f32(out_row1_3 + 0);
+              out10 = neon_vfma_lane_0(out10, in_vec, k13_vec);
+              vst1q_f32(out_row1_3 + 0, out10);
+
+              out01 = vld1q_f32(out_row_3 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k3_vec);
+              vst1q_f32(out_row_3 + 1, out01);
+
+              out11 = vld1q_f32(out_row1_3 + 1);
+              out11 = neon_vfma_lane_1(out11, in_vec, k13_vec);
+              vst1q_f32(out_row1_3 + 1, out11);
+
+              out02 = vld1q_f32(out_row_3 + 2);
+              out02 = neon_vfma_lane_2(out02, in_vec, k3_vec);
+              vst1q_f32(out_row_3 + 2, out02);
+
+              out12 = vld1q_f32(out_row1_3 + 2);
+              out12 = neon_vfma_lane_2(out12, in_vec, k13_vec);
+              vst1q_f32(out_row1_3 + 2, out12);
+
+              out03 = vld1q_f32(out_row_3 + 3);
+              out03 = neon_vfma_lane_3(out03, in_vec, k3_vec);
+              vst1q_f32(out_row_3 + 3, out03);
+
+              out13 = vld1q_f32(out_row1_3 + 3);
+              out13 = neon_vfma_lane_3(out13, in_vec, k13_vec);
+              vst1q_f32(out_row1_3 + 3, out13);
+
+              in += 4;
+              out_row_0 += 4;
+              out_row_1 += 4;
+              out_row_2 += 4;
+              out_row_3 += 4;
+              out_row1_0 += 4;
+              out_row1_1 += 4;
+              out_row1_2 += 4;
+              out_row1_3 += 4;
+            }
+#endif
+            for (; j < w; j++) {
+              float val = in[0];
+              for (int k = 0; k < 4; ++k) {
+                out_row_0[k] += val * k0[k];
+                out_row_1[k] += val * k1[k];
+                out_row_2[k] += val * k2[k];
+                out_row_3[k] += val * k3[k];
+                out_row1_0[k] += val * k10[k];
+                out_row1_1[k] += val * k11[k];
+                out_row1_2[k] += val * k12[k];
+                out_row1_3[k] += val * k13[k];
+              }
+              in++;
+              out_row_0++;
+              out_row_1++;
+              out_row_2++;
+              out_row_3++;
+              out_row1_0++;
+              out_row1_1++;
+              out_row1_2++;
+              out_row1_3++;
+            }
+          }
+        }
+      } else {
+        float *out_base = output + (b * outch + oc) * out_img_size;
+        const float bias_value = bias ? bias[oc] : 0.f;
+        std::fill_n(out_base, out_img_size, bias_value);
+        for (int q = 0; q < inch; q++) {
+          const float *input_base = input + (b * inch + q) * h * w;
+          const float *kernel_base = filter + (oc * inch + q) * 16;
+          const float *in = input_base;
+          const float *k0 = kernel_base;
+          const float *k1 = kernel_base + 4;
+          const float *k2 = kernel_base + 8;
+          const float *k3 = kernel_base + 12;
+#if defined(MACE_ENABLE_NEON)
+          float32x4_t k0_vec = vld1q_f32(k0);
+          float32x4_t k1_vec = vld1q_f32(k1);
+          float32x4_t k2_vec = vld1q_f32(k2);
+          float32x4_t k3_vec = vld1q_f32(k3);
+#endif
+          for (int i = 0; i < h; i++) {
+            float *out_row = out_base + i * outw;
+            float *out_row_0 = out_row;
+            float *out_row_1 = out_row_0 + outw;
+            float *out_row_2 = out_row_1 + outw;
+            float *out_row_3 = out_row_2 + outw;
+            int j = 0;
+#if defined(MACE_ENABLE_NEON)
+            for (; j + 3 < w; j += 4) {
+              float32x4_t in_vec = vld1q_f32(in);
+
+              float32x4_t out00 = vld1q_f32(out_row_0);
+              out00 = neon_vfma_lane_0(out00, in_vec, k0_vec);
+              vst1q_f32(out_row_0, out00);
+
+              float32x4_t out01 = vld1q_f32(out_row_0 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k0_vec);
+              vst1q_f32(out_row_0 + 1, out01);
+
+              float32x4_t out02 = vld1q_f32(out_row_0 + 2);
+              out02 = neon_vfma_lane_2(out02, in_vec, k0_vec);
+              vst1q_f32(out_row_0 + 2, out02);
+
+              float32x4_t out03 = vld1q_f32(out_row_0 + 3);
+              out03 = neon_vfma_lane_3(out03, in_vec, k0_vec);
+              vst1q_f32(out_row_0 + 3, out03);
+
+              //
+              float32x4_t out10 = vld1q_f32(out_row_1);
+              out10 = neon_vfma_lane_0(out10, in_vec, k1_vec);
+              vst1q_f32(out_row_1, out10);
+
+              float32x4_t out11 = vld1q_f32(out_row_1 + 1);
+              out11 = neon_vfma_lane_1(out11, in_vec, k1_vec);
+              vst1q_f32(out_row_1 + 1, out11);
+
+              float32x4_t out12 = vld1q_f32(out_row_1 + 2);
+              out12 = neon_vfma_lane_2(out12, in_vec, k1_vec);
+              vst1q_f32(out_row_1 + 2, out12);
+
+              float32x4_t out13 = vld1q_f32(out_row_1 + 3);
+              out13 = neon_vfma_lane_3(out13, in_vec, k1_vec);
+              vst1q_f32(out_row_1 + 3, out13);
+
+              //
+              float32x4_t out20 = vld1q_f32(out_row_2 + 0);
+              out20 = neon_vfma_lane_0(out20, in_vec, k2_vec);
+              vst1q_f32(out_row_2 + 0, out20);
+
+              float32x4_t out21 = vld1q_f32(out_row_2 + 1);
+              out21 = neon_vfma_lane_1(out21, in_vec, k2_vec);
+              vst1q_f32(out_row_2 + 1, out21);
+
+              float32x4_t out22 = vld1q_f32(out_row_2 + 2);
+              out22 = neon_vfma_lane_2(out22, in_vec, k2_vec);
+              vst1q_f32(out_row_2 + 2, out22);
+
+              float32x4_t out23 = vld1q_f32(out_row_2 + 3);
+              out23 = neon_vfma_lane_3(out23, in_vec, k2_vec);
+              vst1q_f32(out_row_2 + 3, out23);
+
+              //
+              float32x4_t out30 = vld1q_f32(out_row_3 + 0);
+              out30 = neon_vfma_lane_0(out30, in_vec, k3_vec);
+              vst1q_f32(out_row_3 + 0, out30);
+
+              float32x4_t out31 = vld1q_f32(out_row_3 + 1);
+              out31 = neon_vfma_lane_1(out31, in_vec, k3_vec);
+              vst1q_f32(out_row_3 + 1, out31);
+
+              float32x4_t out32 = vld1q_f32(out_row_3 + 2);
+              out32 = neon_vfma_lane_2(out32, in_vec, k3_vec);
+              vst1q_f32(out_row_3 + 2, out32);
+
+              float32x4_t out33 = vld1q_f32(out_row_3 + 3);
+              out33 = neon_vfma_lane_3(out33, in_vec, k3_vec);
+              vst1q_f32(out_row_3 + 3, out33);
+
+              in += 4;
+              out_row_0 += 4;
+              out_row_1 += 4;
+              out_row_2 += 4;
+              out_row_3 += 4;
+            }
+#endif
+            for (; j < w; j++) {
+              float val = in[0];
+              for (int k = 0; k < 4; ++k) {
+                out_row_0[k] += val * k0[k];
+                out_row_1[k] += val * k1[k];
+                out_row_2[k] += val * k2[k];
+                out_row_3[k] += val * k3[k];
+              }
+              in++;
+              out_row_0++;
+              out_row_1++;
+              out_row_2++;
+              out_row_3++;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void Deconv2dNeonK4x4S2(const float *input,
+                        const float *filter,
+                        const float *bias,
+                        const index_t *in_shape,
+                        const index_t *out_shape,
+                        float *output) {
+  const index_t w = in_shape[3];
+  const index_t h = in_shape[2];
+  const index_t inch = in_shape[1];
+
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+  const index_t outch = out_shape[1];
+  const index_t out_img_size = outh * outw;
+
+#pragma omp parallel for
+  for (int b = 0; b < out_shape[0]; ++b) {
+    for (int p = 0; p < outch; p++) {
+      float *out_base = output + (b * outch + p) * out_img_size;
+      const float bias_value = bias ? bias[p] : 0.f;
+      std::fill_n(out_base, outh * outw, bias_value);
+
+      for (int q = 0; q < inch; q++) {
+        const float *input_base = input + (b * inch + q) * h * w;
+        const float *kernel_base = filter + (p * inch + q) * 16;
+        const float *in = input_base;
+
+        const float *k0 = kernel_base;
+        const float *k1 = kernel_base + 4;
+        const float *k2 = kernel_base + 8;
+        const float *k3 = kernel_base + 12;
+#if defined(MACE_ENABLE_NEON)
+        float32x4_t k0_vec = vld1q_f32(k0);
+        float32x4_t k1_vec = vld1q_f32(k1);
+        float32x4_t k2_vec = vld1q_f32(k2);
+        float32x4_t k3_vec = vld1q_f32(k3);
+#endif
+        for (int i = 0; i < h; i++) {
+          float *out_row = out_base + 2 * i * outw;
+
+          float *out_row_0 = out_row;
+          float *out_row_1 = out_row_0 + outw;
+          float *out_row_2 = out_row_1 + outw;
+          float *out_row_3 = out_row_2 + outw;
+
+          int j = 0;
+#if defined(MACE_ENABLE_NEON)
+          for (; j + 3 < w; j += 4) {
+            float32x4_t in_vec = vld1q_f32(in);
+
+            // row 0
+            float32x4x2_t out0 = vld2q_f32(out_row_0);
+            out0.val[0] =
+              neon_vfma_lane_0(out0.val[0], in_vec, k0_vec);
+            out0.val[1] =
+              neon_vfma_lane_1(out0.val[1], in_vec, k0_vec);
+            vst2q_f32(out_row_0, out0);
+            out0 = vld2q_f32(out_row_0 + 2);
+            out0.val[0] =
+              neon_vfma_lane_2(out0.val[0], in_vec, k0_vec);
+            out0.val[1] =
+              neon_vfma_lane_3(out0.val[1], in_vec, k0_vec);
+            vst2q_f32(out_row_0 + 2, out0);
+
+            // row 1
+            float32x4x2_t out1 = vld2q_f32(out_row_1);
+            out1.val[0] =
+              neon_vfma_lane_0(out1.val[0], in_vec, k1_vec);
+            out1.val[1] =
+              neon_vfma_lane_1(out1.val[1], in_vec, k1_vec);
+            vst2q_f32(out_row_1, out1);
+            out1 = vld2q_f32(out_row_1 + 2);
+            out1.val[0] =
+              neon_vfma_lane_2(out1.val[0], in_vec, k1_vec);
+            out1.val[1] =
+              neon_vfma_lane_3(out1.val[1], in_vec, k1_vec);
+            vst2q_f32(out_row_1 + 2, out1);
+
+            // row 2
+            float32x4x2_t out2 = vld2q_f32(out_row_2);
+            out2.val[0] =
+              neon_vfma_lane_0(out2.val[0], in_vec, k2_vec);
+            out2.val[1] =
+              neon_vfma_lane_1(out2.val[1], in_vec, k2_vec);
+            vst2q_f32(out_row_2, out2);
+            out2 = vld2q_f32(out_row_2 + 2);
+            out2.val[0] =
+              neon_vfma_lane_2(out2.val[0], in_vec, k2_vec);
+            out2.val[1] =
+              neon_vfma_lane_3(out2.val[1], in_vec, k2_vec);
+            vst2q_f32(out_row_2 + 2, out2);
+
+            // row 3
+            float32x4x2_t out3 = vld2q_f32(out_row_3);
+            out3.val[0] =
+              neon_vfma_lane_0(out3.val[0], in_vec, k3_vec);
+            out3.val[1] =
+              neon_vfma_lane_1(out3.val[1], in_vec, k3_vec);
+            vst2q_f32(out_row_3, out3);
+            out3 = vld2q_f32(out_row_3 + 2);
+            out3.val[0] =
+              neon_vfma_lane_2(out3.val[0], in_vec, k3_vec);
+            out3.val[1] =
+              neon_vfma_lane_3(out3.val[1], in_vec, k3_vec);
+            vst2q_f32(out_row_3 + 2, out3);
+
+            in += 4;
+            out_row_0 += 8;
+            out_row_1 += 8;
+            out_row_2 += 8;
+            out_row_3 += 8;
+          }
+#endif
+          for (; j < w; j++) {
+            float val = in[0];
+            for (int k = 0; k < 4; ++k) {
+              out_row_0[k] += val * k0[k];
+              out_row_1[k] += val * k1[k];
+              out_row_2[k] += val * k2[k];
+              out_row_3[k] += val * k3[k];
+            }
+            in++;
+            out_row_0 += 2;
+            out_row_1 += 2;
+            out_row_2 += 2;
+            out_row_3 += 2;
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace kernels
+}  // namespace mace
--- a/mace/kernels/deconv_2d.h
+++ b/mace/kernels/deconv_2d.h
@@ -15,76 +15,25 @@
 #ifndef MACE_KERNELS_DECONV_2D_H_
 #define MACE_KERNELS_DECONV_2D_H_

-#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+#if defined(MACE_ENABLE_NEON)
 #include <arm_neon.h>
 #endif
+
 #include <algorithm>
+#include <functional>
 #include <memory>
 #include <vector>

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/activation.h"
+#include "mace/kernels/arm/deconv_2d_neon.h"
 #include "mace/kernels/conv_pool_2d_util.h"
 #include "mace/utils/utils.h"

 namespace mace {
 namespace kernels {

-namespace deconv {
-
-template<typename T>
-void Deconv2dNCHW(const T *input,
-                  const T *filter,
-                  const T *bias,
-                  const index_t *in_shape,
-                  const index_t *out_shape,
-                  const index_t *kernel_hw,
-                  const int *strides,
-                  const int *padding,
-                  float *output) {
-#pragma omp parallel for collapse(4)
-  for (index_t b = 0; b < out_shape[0]; ++b) {
-    for (index_t oc = 0; oc < out_shape[1]; ++oc) {
-      for (index_t oh = 0; oh < out_shape[2]; ++oh) {
-        for (index_t ow = 0; ow < out_shape[3]; ++ow) {
-          index_t filter_start_y, filter_start_x;
-          index_t start_x = std::max<int>(0, ow + strides[1] -1 - padding[1]);
-          index_t start_y = std::max<int>(0, oh + strides[0] -1 - padding[0]);
-          start_x /= strides[1];
-          start_y /= strides[0];
-          filter_start_x = padding[1] + strides[1] * start_x - ow;
-          filter_start_y = padding[0] + strides[0] * start_y - oh;
-          filter_start_x = kernel_hw[1] - 1 - filter_start_x;
-          filter_start_y = kernel_hw[0] - 1 - filter_start_y;
-          T out_value = 0;
-          index_t out_pos =
-              ((b * out_shape[1] + oc) * out_shape[2] + oh) * out_shape[3] + ow;
-          for (index_t ic = 0; ic < in_shape[1]; ++ic) {
-            for (index_t f_y = filter_start_y, ih = start_y;
-                 f_y >= 0 && ih < in_shape[2]; f_y -= strides[0], ++ih) {
-              for (index_t f_x = filter_start_x, iw = start_x;
-                  f_x >= 0 && iw < in_shape[3]; f_x -= strides[1], ++iw) {
-                  index_t weight_pos =
-                      ((oc * in_shape[1] + ic) * kernel_hw[0] + f_y)
-                          * kernel_hw[1] + f_x;
-                  index_t in_pos =
-                      ((b * in_shape[1] + ic) * in_shape[2] + ih)
-                          * in_shape[3] + iw;
-                  out_value += input[in_pos] * filter[weight_pos];
-              }
-            }
-          }
-          if (bias != nullptr)
-            out_value += bias[oc];
-          output[out_pos] = out_value;
-        }
-      }
-    }
-  }
-}
-}  // namespace deconv
-
 struct Deconv2dFunctorBase : OpKernel {
  Deconv2dFunctorBase(OpKernelContext *context,
                      const std::vector<int> &strides,
@@ -107,6 +56,7 @@ struct Deconv2dFunctorBase : OpKernel {
      const int *strides,
      index_t *output_shape,
      const int *padding_size,
+      int *input_padding,
      const bool isNCHW = false) {
    MACE_CHECK_NOTNULL(output_shape);
    MACE_CHECK_NOTNULL(padding_size);
@@ -119,13 +69,18 @@ struct Deconv2dFunctorBase : OpKernel {
    const index_t in_height = isNCHW ? input_shape[2] : input_shape[1];
    const index_t in_width = isNCHW ? input_shape[3] : input_shape[2];

-    const index_t filter_h = filter_shape[2];
-    const index_t filter_w = filter_shape[3];
+    const index_t kernel_h = filter_shape[2];
+    const index_t kernel_w = filter_shape[3];
+
+    input_padding[0] = static_cast<int>((kernel_h -1) * 2 - padding_size[0]);
+    input_padding[1] = static_cast<int>((kernel_w -1) * 2 - padding_size[1]);
+    input_padding[0] = std::max<int>(0, input_padding[0]);
+    input_padding[1] = std::max<int>(0, input_padding[1]);

    index_t out_height =
-        (in_height - 1) * strides[0] + filter_h -padding_size[0];
+        (in_height - 1) * strides[0] + kernel_h - padding_size[0];
    index_t out_width =
-        (in_width - 1) * strides[1] + filter_w -padding_size[1];
+        (in_width - 1) * strides[1] + kernel_w - padding_size[1];

    output_shape[0] = input_shape[0];
    if (isNCHW) {
@@ -206,8 +161,12 @@ struct Deconv2dFunctorBase : OpKernel {
  const float relux_max_limit_;
 };

-template <DeviceType D, typename T>
-struct Deconv2dFunctor : Deconv2dFunctorBase {
+
+template<DeviceType D, typename T>
+struct Deconv2dFunctor;
+
+template<>
+struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
  Deconv2dFunctor(OpKernelContext *context,
                  const std::vector<int> &strides,
                  const Padding &padding_type,
@@ -223,6 +182,92 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
                            activation,
                            relux_max_limit) {}

+  void Deconv2dGeneral(const float *input,
+                       const float *filter,
+                       const float *bias,
+                       const index_t kernel_h,
+                       const index_t kernel_w,
+                       const int *strides,
+                       const index_t *in_shape,
+                       const index_t *out_shape,
+                       float *output) {
+    const index_t kernel_size = kernel_h * kernel_w;
+    std::vector<int> out_map(kernel_size);
+    int p0 = 0;
+    int p1 = 0;
+    index_t gap = out_shape[3] - kernel_w;
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        out_map[p0] = p1;
+        p0++;
+        p1++;
+      }
+      p1 += gap;
+    }
+
+    const index_t out_height = out_shape[2];
+    const index_t out_width = out_shape[3];
+    const index_t in_height = in_shape[2];
+    const index_t in_width = in_shape[3];
+    const index_t out_img_size = out_height * out_width;
+    const index_t in_img_size = in_height * in_width;
+
+#pragma omp parallel for
+    for (int b = 0; b < in_shape[0]; ++b) {
+      for (int oc = 0; oc < out_shape[1]; ++oc) {
+        float *out_base =
+            output + (b * out_shape[1] + oc) * out_img_size;
+        const float bias_value = bias ? bias[oc] : 0.f;
+        std::fill_n(out_base, out_img_size, bias_value);
+        for (int i = 0; i < in_height; ++i) {
+          for (int j = 0; j < in_width; ++j) {
+            const index_t out_offset =
+                i * strides[0] * out_width + j * strides[1];
+            for (int ic = 0; ic < in_shape[1]; ++ic) {
+              const index_t input_idx =
+                  (b * in_shape[1] + ic) * in_img_size + i * in_width + j;
+              const float val = input[input_idx];
+              const index_t kernel_offset =
+                  (oc * in_shape[1] + ic) * kernel_size;
+              for (int k = 0; k < kernel_size; ++k) {
+                const index_t out_idx = out_offset + out_map[k];
+                const index_t kernel_idx = kernel_offset + k;
+                out_base[out_idx] += val * filter[kernel_idx];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void CropPadOut(const float *input,
+                  const index_t *in_shape,
+                  const index_t *out_shape,
+                  const index_t pad_h,
+                  const index_t pad_w,
+                  float *output) {
+    const index_t batch = in_shape[0];
+    const index_t channel = in_shape[1];
+    const index_t in_height = in_shape[2];
+    const index_t in_width = in_shape[3];
+
+    const index_t out_height = out_shape[2];
+    const index_t out_width = out_shape[3];
+#pragma omp parallel for
+    for (int i = 0; i < batch; ++i) {
+      for (int j = 0; j < channel; ++j) {
+        for (int k = 0; k < out_height; ++k) {
+          const float *input_base =
+              input + ((i * channel + j) * in_height + (k + pad_h)) * in_width;
+          float *output_base =
+              output + ((i * channel + j) * out_height + k)* out_width;
+          memcpy(output_base, input_base + pad_w, out_width * sizeof(float));
+        }
+      }
+    }
+  }
+
  MaceStatus operator()(const Tensor *input,   // NCHW
                  const Tensor *filter,  // OIHW
                  const Tensor *bias,
@@ -235,6 +280,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
    MACE_CHECK_NOTNULL(output);

    std::vector<int> paddings(2);
+    std::vector<int> out_paddings(2);
    std::vector<index_t> output_shape(4);
    if (paddings_.empty()) {  // tensorflow
      paddings = std::vector<int>(2, 0);
@@ -261,19 +307,20 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
          output_shape.data(),
          paddings.data(), true);
    } else {  // caffe
-      paddings = paddings_;
+      out_paddings = paddings_;
      output_shape = std::vector<index_t>(4, 0);
      CalcDeconvOutputSize(input->shape().data(),
                           filter->shape().data(),
                           strides_.data(),
                           output_shape.data(),
-                           paddings.data(), true);
+                           out_paddings.data(),
+                           paddings.data(),
+                           true);
    }
    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
    index_t kernel_h = filter->dim(2);
    index_t kernel_w = filter->dim(3);
    const index_t *in_shape = input->shape().data();
-    const index_t kernel_hw[2] = {kernel_h, kernel_w};

    MACE_CHECK(filter->dim(0) == output_shape[1], filter->dim(0), " != ",
               output_shape[1]);
@@ -281,28 +328,144 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
               in_shape[1]);
    MACE_CHECK(in_shape[0] == output_shape[0],
               "Input/Output batch size mismatch");
+    std::function<void(const float *input,
+                       const float *filter,
+                       const float *bias,
+                       const index_t *in_shape,
+                       const index_t *out_shape,
+                       float *output)> deconv_func;
+
    Tensor::MappingGuard input_mapper(input);
    Tensor::MappingGuard filter_mapper(filter);
    Tensor::MappingGuard bias_mapper(bias);
    Tensor::MappingGuard output_mapper(output);
-    auto input_data = input->data<T>();
-    auto filter_data = filter->data<T>();
-    auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
-    auto output_data = output->mutable_data<T>();
-    int padding[2];
-    padding[0] = (paddings[0] + 1) >> 1;
-    padding[1] = (paddings[1] + 1) >> 1;
-    deconv::Deconv2dNCHW(input_data,
-                         filter_data,
-                         bias_data,
-                         in_shape,
-                         output_shape.data(),
-                         kernel_hw,
-                         strides_.data(),
-                         padding,
-                         output_data);
-
-    DoActivation(output_data,
+    auto input_data = input->data<float>();
+    auto filter_data = filter->data<float>();
+    auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
+    auto output_data = output->mutable_data<float>();
+
+    const index_t padded_out_h = (in_shape[2] - 1) * strides_[0] + kernel_h;
+    const index_t padded_out_w = (in_shape[3] - 1) * strides_[1] + kernel_w;
+    const index_t pad_h = (padded_out_h - output_shape[2]) / 2;
+    const index_t pad_w = (padded_out_w - output_shape[3]) / 2;
+
+    std::vector<index_t> padded_out_shape({output_shape[0], output_shape[1],
+                                           padded_out_h, padded_out_w});
+    index_t padded_out_size =
+        std::accumulate(padded_out_shape.begin(),
+                        padded_out_shape.end(),
+                        1,
+                        std::multiplies<index_t>()) * sizeof(float);
+    ScratchBuffer *scratch = context_->device()->scratch_buffer();
+    scratch->Rewind();
+    scratch->GrowSize(padded_out_size);
+    Tensor padded_out(scratch->Scratch(padded_out_size), DT_FLOAT);
+    auto *padded_out_data = padded_out.mutable_data<float>();
+
+    bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 &&
+        strides_[0] == strides_[1] && strides_[0] == 1;
+    bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 &&
+        strides_[0] == strides_[1] && strides_[0] == 2;
+
+    bool use_neon_4x4_s1 = kernel_h == kernel_w && kernel_h == 4 &&
+        strides_[0] == strides_[1] && strides_[0] == 1;
+    bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 &&
+        strides_[0] == strides_[1] && strides_[0] == 2;
+
+    if (use_neon_3x3_s1) {
+      deconv_func = [=](const float *input,
+                        const float *filter,
+                        const float *bias,
+                        const index_t *in_shape,
+                        const index_t *padded_out_shape,
+                        float *padded_output) {
+        Deconv2dNeonK3x3S1(input,
+                           filter,
+                           bias,
+                           in_shape,
+                           padded_out_shape,
+                           padded_output);
+      };
+    } else if (use_neon_3x3_s2) {
+      deconv_func = [=](const float *input,
+                        const float *filter,
+                        const float *bias,
+                        const index_t *in_shape,
+                        const index_t *padded_out_shape,
+                        float *padded_output) {
+        Deconv2dNeonK3x3S2(input,
+                           filter,
+                           bias,
+                           in_shape,
+                           padded_out_shape,
+                           padded_output);
+      };
+    } else if (use_neon_4x4_s1) {
+      deconv_func = [=](const float *input,
+                        const float *filter,
+                        const float *bias,
+                        const index_t *in_shape,
+                        const index_t *padded_out_shape,
+                        float *padded_output) {
+        Deconv2dNeonK4x4S1(input,
+                           filter,
+                           bias,
+                           in_shape,
+                           padded_out_shape,
+                           padded_output);
+      };
+    } else if (use_neon_4x4_s2) {
+      deconv_func = [=](const float *input,
+                        const float *filter,
+                        const float *bias,
+                        const index_t *in_shape,
+                        const index_t *padded_out_shape,
+                        float *padded_output) {
+        Deconv2dNeonK4x4S2(input,
+                           filter,
+                           bias,
+                           in_shape,
+                           padded_out_shape,
+                           padded_output);
+      };
+    } else {
+      deconv_func = [=](const float *input,
+                        const float *filter,
+                        const float *bias,
+                        const index_t *in_shape,
+                        const index_t *padded_out_shape,
+                        float *padded_output) {
+        Deconv2dGeneral(input,
+                        filter,
+                        bias,
+                        kernel_h,
+                        kernel_w,
+                        strides_.data(),
+                        in_shape,
+                        padded_out_shape,
+                        padded_output);
+      };
+    }
+
+    bool no_pad =
+        padded_out_h == output_shape[2] && padded_out_w == output_shape[3];
+    float *out_data = no_pad ? output_data : padded_out_data;
+    deconv_func(input_data,
+                filter_data,
+                bias_data,
+                in_shape,
+                padded_out_shape.data(),
+                out_data);
+    if (!no_pad) {
+      CropPadOut(out_data,
+                 padded_out_shape.data(),
+                 output_shape.data(),
+                 pad_h,
+                 pad_w,
+                 output_data);
+    }
+
+    DoActivation<float>(output_data,
                 output_data,
                 output->size(),
                 activation_,

--- a/mace/kernels/opencl/deconv_2d.cc
+++ b/mace/kernels/opencl/deconv_2d.cc
@@ -53,6 +53,7 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
  MACE_CHECK_NOTNULL(filter);
  MACE_CHECK_NOTNULL(output);
  std::vector<int> paddings(2);
+  std::vector<int> out_paddings(2);
  std::vector<index_t> output_shape(4);
  if (paddings_.empty()) {
    paddings = std::vector<int>(2, 0);
@@ -74,12 +75,14 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
                                  output_shape.data(),
                                  paddings.data());
  } else {
-    paddings = paddings_;
+    out_paddings = paddings_;
+    paddings = std::vector<int>(2, 0);
    output_shape = std::vector<index_t>(4, 0);
    CalcDeconvOutputSize(input->shape().data(),
                         filter->shape().data(),
                         strides_.data(),
                         output_shape.data(),
+                         out_paddings.data(),
                         paddings.data());
  }


--- a/mace/ops/deconv_2d_benchmark.cc
+++ b/mace/ops/deconv_2d_benchmark.cc
@@ -116,6 +116,7 @@ static void Deconv2d(int iters,

 // TODO(liutuo): add cpu benchmark when optimized.
 #define MACE_BM_DECONV_2D(N, C, H, W, KH, KW, S, OH, OW, P, OC)              \
+  MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, CPU); \
  MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \
  MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU);

@@ -124,6 +125,11 @@ MACE_BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128);

 MACE_BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128);
 MACE_BM_DECONV_2D(1, 32, 60, 60, 3, 3, 1, 60, 60, SAME, 32);
+
+MACE_BM_DECONV_2D(1, 128, 60, 60, 4, 4, 1, 63, 63, VALID, 128);
+MACE_BM_DECONV_2D(1, 32, 60, 60, 4, 4, 1, 60, 60, SAME, 32);
+MACE_BM_DECONV_2D(1, 3, 224, 224, 4, 4, 2, 448, 448, SAME, 32);
+MACE_BM_DECONV_2D(1, 3, 224, 224, 4, 4, 2, 450, 450, VALID, 32);
 MACE_BM_DECONV_2D(1, 3, 512, 512, 7, 7, 2, 1023, 1023, SAME, 32);
 MACE_BM_DECONV_2D(1, 128, 16, 16, 5, 5, 1, 20, 20, VALID, 32);
 MACE_BM_DECONV_2D(1, 128, 64, 64, 5, 5, 1, 68, 68, VALID, 32);
@@ -134,6 +140,7 @@ MACE_BM_DECONV_2D(1, 64, 32, 32, 1, 1, 1, 32, 32, VALID, 128);
 MACE_BM_DECONV_2D(1, 64, 33, 32, 3, 3, 2, 65, 63, SAME, 128);
 MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 447, 447, SAME, 32);
 MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 449, 449, VALID, 32);
+MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 448, 448, SAME, 32);

 }  // namespace test
 }  // namespace ops

--- a/mace/ops/deconv_2d_test.cc
+++ b/mace/ops/deconv_2d_test.cc
@@ -380,11 +380,11 @@ void TestComplexDeconvNxNS12(const int batch,
                            1e-4);
  };

-  for (int kernel_size : {1, 3, 5, 7}) {
+  for (int kernel_size : {3, 4, 5, 7}) {
    func(kernel_size, kernel_size, stride, stride, VALID, -1);
    func(kernel_size, kernel_size, stride, stride, SAME, -1);
-    func(kernel_size, kernel_size, stride, stride, VALID, 1);
    func(kernel_size, kernel_size, stride, stride, VALID, 2);
+    func(kernel_size, kernel_size, stride, stride, VALID, 3);
  }
 }
 }  // namespace
@@ -410,8 +410,8 @@ TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS34) {
 }

 TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNMultiBatch) {
-  TestComplexDeconvNxNS12<DeviceType::GPU, float>(3, {17, 13, 5, 7}, 1);
-  TestComplexDeconvNxNS12<DeviceType::GPU, float>(5, {17, 13, 5, 7}, 2);
+  TestComplexDeconvNxNS12<DeviceType::GPU, float>(3, {17, 113, 5, 7}, 1);
+  TestComplexDeconvNxNS12<DeviceType::GPU, float>(5, {17, 113, 5, 7}, 2);
 }

 }  // namespace test