From c057b7ec5c7df03fe3fc0da7f4ce1856056e1b85 Mon Sep 17 00:00:00 2001
From: liutuo <liutuo@xiaomi.com>
Date: Fri, 28 Sep 2018 09:56:04 +0800
Subject: [PATCH] add depthwise deconv

---
 mace/ops/arm/common_neon.h                    |  70 ++
 mace/ops/arm/deconv_2d_neon.h                 |  47 +-
 mace/ops/arm/depthwise_deconv2d_neon.h        |  79 ++
 mace/ops/arm/depthwise_deconv2d_neon_3x3.cc   | 627 ++++++++++++++
 mace/ops/arm/depthwise_deconv2d_neon_4x4.cc   | 805 ++++++++++++++++++
 mace/ops/deconv_2d.cc                         |  39 +-
 mace/ops/deconv_2d.h                          |  30 +
 mace/ops/depthwise_deconv2d.cc                | 547 ++++++++++++
 mace/ops/depthwise_deconv2d_benchmark.cc      | 130 +++
 mace/ops/depthwise_deconv2d_test.cc           | 286 +++++++
 mace/ops/opencl/cl/deconv_2d.cl               |   3 +-
 mace/ops/opencl/cl/depthwise_deconv2d.cl      | 149 ++++
 mace/ops/opencl/depthwise_deconv2d.h          |  49 ++
 mace/ops/opencl/image/deconv_2d.h             |   4 +-
 mace/ops/opencl/image/depthwise_deconv2d.h    | 196 +++++
 mace/ops/ops_registry.cc                      |   2 +
 .../tools/converter_tool/base_converter.py    |   2 +
 .../tools/converter_tool/caffe_converter.py   |  17 +-
 .../tools/converter_tool/shape_inference.py   |   5 +
 .../tools/converter_tool/transformer.py       |  53 +-
 .../opencl-kernel/opencl_kernel_configure.bzl |   1 +
 tools/validate.py                             |  12 +-
 22 files changed, 3037 insertions(+), 116 deletions(-)
 create mode 100644 mace/ops/arm/common_neon.h
 create mode 100644 mace/ops/arm/depthwise_deconv2d_neon.h
 create mode 100644 mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
 create mode 100644 mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
 create mode 100644 mace/ops/depthwise_deconv2d.cc
 create mode 100644 mace/ops/depthwise_deconv2d_benchmark.cc
 create mode 100644 mace/ops/depthwise_deconv2d_test.cc
 create mode 100644 mace/ops/opencl/cl/depthwise_deconv2d.cl
 create mode 100644 mace/ops/opencl/depthwise_deconv2d.h
 create mode 100644 mace/ops/opencl/image/depthwise_deconv2d.h

diff --git a/mace/ops/arm/common_neon.h b/mace/ops/arm/common_neon.h
new file mode 100644
index 00000000..be0daf79
--- /dev/null
+++ b/mace/ops/arm/common_neon.h
@@ -0,0 +1,70 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_COMMON_NEON_H_
+#define MACE_OPS_ARM_COMMON_NEON_H_
+
+#if defined(MACE_ENABLE_NEON)
+#include <arm_neon.h>
+#endif
+
+namespace mace {
+namespace ops {
+
+#ifdef MACE_ENABLE_NEON
+inline float32x4_t neon_vfma_lane_0(float32x4_t a,
+                          float32x4_t b,
+                          float32x4_t c) {
+#ifdef __aarch64__
+  return vfmaq_laneq_f32(a, b, c, 0);
+#else
+  return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
+#endif
+}
+
+inline float32x4_t neon_vfma_lane_1(float32x4_t a,
+                          float32x4_t b,
+                          float32x4_t c) {
+#ifdef __aarch64__
+  return vfmaq_laneq_f32(a, b, c, 1);
+#else
+  return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
+#endif
+}
+
+inline float32x4_t neon_vfma_lane_2(float32x4_t a,
+                          float32x4_t b,
+                          float32x4_t c) {
+#ifdef __aarch64__
+  return vfmaq_laneq_f32(a, b, c, 2);
+#else
+  return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
+#endif
+}
+
+inline float32x4_t neon_vfma_lane_3(float32x4_t a,
+                          float32x4_t b,
+                          float32x4_t c) {
+#ifdef __aarch64__
+  return vfmaq_laneq_f32(a, b, c, 3);
+#else
+  return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
+#endif
+}
+#endif
+
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_COMMON_NEON_H_
diff --git a/mace/ops/arm/deconv_2d_neon.h b/mace/ops/arm/deconv_2d_neon.h
index d8abe427..abed4926 100644
--- a/mace/ops/arm/deconv_2d_neon.h
+++ b/mace/ops/arm/deconv_2d_neon.h
@@ -15,11 +15,8 @@
 #ifndef MACE_OPS_ARM_DECONV_2D_NEON_H_
 #define MACE_OPS_ARM_DECONV_2D_NEON_H_
 
-#if defined(MACE_ENABLE_NEON)
-#include <arm_neon.h>
-#endif
-
 #include "mace/core/types.h"
+#include "mace/ops/arm/common_neon.h"
 
 namespace mace {
 namespace ops {
@@ -48,48 +45,6 @@ void Deconv2dNeonK4x4S2(const float *input,
                         const index_t *out_shape,
                         float *output);
 
-#ifdef MACE_ENABLE_NEON
-inline float32x4_t neon_vfma_lane_0(float32x4_t a,
-                          float32x4_t b,
-                          float32x4_t c) {
-#ifdef __aarch64__
-  return vfmaq_laneq_f32(a, b, c, 0);
-#else
-  return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
-#endif
-}
-
-inline float32x4_t neon_vfma_lane_1(float32x4_t a,
-                          float32x4_t b,
-                          float32x4_t c) {
-#ifdef __aarch64__
-  return vfmaq_laneq_f32(a, b, c, 1);
-#else
-  return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
-#endif
-}
-
-inline float32x4_t neon_vfma_lane_2(float32x4_t a,
-                          float32x4_t b,
-                          float32x4_t c) {
-#ifdef __aarch64__
-  return vfmaq_laneq_f32(a, b, c, 2);
-#else
-  return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
-#endif
-}
-
-inline float32x4_t neon_vfma_lane_3(float32x4_t a,
-                          float32x4_t b,
-                          float32x4_t c) {
-#ifdef __aarch64__
-  return vfmaq_laneq_f32(a, b, c, 3);
-#else
-  return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
-#endif
-}
-#endif
-
 }  // namespace ops
 }  // namespace mace
 
diff --git a/mace/ops/arm/depthwise_deconv2d_neon.h b/mace/ops/arm/depthwise_deconv2d_neon.h
new file mode 100644
index 00000000..35460d75
--- /dev/null
+++ b/mace/ops/arm/depthwise_deconv2d_neon.h
@@ -0,0 +1,79 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_DEPTHWISE_DECONV2D_NEON_H_
+#define MACE_OPS_ARM_DEPTHWISE_DECONV2D_NEON_H_
+
+#include "mace/core/types.h"
+#include "mace/ops/arm/common_neon.h"
+
+namespace mace {
+namespace ops {
+
+void DepthwiseDeconv2dNeonK3x3S1(const float *input,
+                                 const float *filter,
+                                 const index_t *in_shape,
+                                 const index_t *out_shape,
+                                 float *output);
+
+void DepthwiseDeconv2dNeonK3x3S2(const float *input,
+                                 const float *filter,
+                                 const index_t *in_shape,
+                                 const index_t *out_shape,
+                                 float *output);
+
+void DepthwiseDeconv2dNeonK4x4S1(const float *input,
+                                 const float *filter,
+                                 const index_t *in_shape,
+                                 const index_t *out_shape,
+                                 float *output);
+
+void DepthwiseDeconv2dNeonK4x4S2(const float *input,
+                                 const float *filter,
+                                 const index_t *in_shape,
+                                 const index_t *out_shape,
+                                 float *output);
+
+void GroupDeconv2dNeonK3x3S1(const float *input,
+                             const float *filter,
+                             const int group,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
+                             float *output);
+
+void GroupDeconv2dNeonK3x3S2(const float *input,
+                             const float *filter,
+                             const int group,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
+                             float *output);
+
+void GroupDeconv2dNeonK4x4S1(const float *input,
+                             const float *filter,
+                             const int group,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
+                             float *output);
+
+void GroupDeconv2dNeonK4x4S2(const float *input,
+                             const float *filter,
+                             const int group,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
+                             float *output);
+
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_DEPTHWISE_DECONV2D_NEON_H_
diff --git a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
new file mode 100644
index 00000000..8a90b9fc
--- /dev/null
+++ b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
@@ -0,0 +1,627 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/macros.h"
+#include "mace/ops/arm/depthwise_deconv2d_neon.h"
+
+namespace mace {
+namespace ops {
+
+void DepthwiseDeconv2dNeonK3x3S1(const float *input,
+                                 const float *filter,
+                                 const index_t *in_shape,
+                                 const index_t *out_shape,
+                                 float *output) {
+  const index_t channels = in_shape[1];
+  const index_t h = in_shape[2];
+  const index_t w = in_shape[3];
+  const index_t in_img_size = h * w;
+
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+  const index_t out_img_size = outh * outw;
+
+#pragma omp parallel for collapse(2)
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t c = 0; c < channels; ++c) {
+      const index_t offset = b * channels + c;
+      float *out_base = output + offset * out_img_size;
+      const float *input_base = input + offset * in_img_size;
+      const float *kernel_base = filter + c * 9;
+      const float *in = input_base;
+      const float *k0 = kernel_base;
+      const float *k1 = kernel_base + 3;
+      const float *k2 = kernel_base + 5;
+
+#if defined(MACE_ENABLE_NEON)
+      // load filter
+      float32x4_t k0_vec = vld1q_f32(k0);
+      float32x4_t k1_vec = vld1q_f32(k1);
+      float32x4_t k2_vec = vld1q_f32(k2);
+#endif
+      for (index_t i = 0; i < h; ++i) {
+        float *out_row_base = out_base + i * outw;
+        float *out_row0 = out_row_base;
+        float *out_row1 = out_row_base + outw;
+        float *out_row2 = out_row_base + 2 * outw;
+        index_t j = 0;
+#if defined(MACE_ENABLE_NEON)
+        for (; j + 3 < w; j += 4) {
+          float32x4_t in_vec = vld1q_f32(in);
+
+          float32x4_t out00, out01, out02;
+          float32x4_t out10, out11, out12;
+          float32x4_t out20, out21, out22;
+
+          out00 = vld1q_f32(out_row0 + 0);
+          out00 = neon_vfma_lane_0(out00, in_vec, k0_vec);
+          vst1q_f32(out_row0 + 0, out00);
+
+          out01 = vld1q_f32(out_row0 + 1);
+          out01 = neon_vfma_lane_1(out01, in_vec, k0_vec);
+          vst1q_f32(out_row0 + 1, out01);
+
+          out02 = vld1q_f32(out_row0 + 2);
+          out02 = neon_vfma_lane_2(out02, in_vec, k0_vec);
+          vst1q_f32(out_row0 + 2, out02);
+
+          out10 = vld1q_f32(out_row1 + 0);
+          out10 = neon_vfma_lane_0(out10, in_vec, k1_vec);
+          vst1q_f32(out_row1 + 0, out10);
+
+          out11 = vld1q_f32(out_row1 + 1);
+          out11 = neon_vfma_lane_1(out11, in_vec, k1_vec);
+          vst1q_f32(out_row1 + 1, out11);
+
+          out12 = vld1q_f32(out_row1 + 2);
+          out12 = neon_vfma_lane_2(out12, in_vec, k1_vec);
+          vst1q_f32(out_row1 + 2, out12);
+
+          out20 = vld1q_f32(out_row2 + 0);
+          out20 = neon_vfma_lane_1(out20, in_vec, k2_vec);
+          vst1q_f32(out_row2 + 0, out20);
+
+          out21 = vld1q_f32(out_row2 + 1);
+          out21 = neon_vfma_lane_2(out21, in_vec, k2_vec);
+          vst1q_f32(out_row2 + 1, out21);
+
+          out22 = vld1q_f32(out_row2 + 2);
+          out22 = neon_vfma_lane_3(out22, in_vec, k2_vec);
+          vst1q_f32(out_row2 + 2, out22);
+
+          in += 4;
+          out_row0 += 4;
+          out_row1 += 4;
+          out_row2 += 4;
+        }
+#endif
+        for (; j < w; ++j) {
+          float val = in[0];
+          for (int k = 0; k < 3; ++k) {
+            out_row0[k] += val * k0[k];
+            out_row1[k] += val * k1[k];
+            out_row2[k] += val * k2[k + 1];
+          }
+          in++;
+          out_row0++;
+          out_row1++;
+          out_row2++;
+        }
+      }
+    }
+  }
+}
+
+void DepthwiseDeconv2dNeonK3x3S2(const float *input,
+                                 const float *filter,
+                                 const index_t *in_shape,
+                                 const index_t *out_shape,
+                                 float *output) {
+  const index_t channels = in_shape[1];
+  const index_t h = in_shape[2];
+  const index_t w = in_shape[3];
+  const index_t in_img_size = h * w;
+
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+  const index_t out_img_size = outh * outw;
+
+#pragma omp parallel for collapse(2)
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t c = 0; c < channels; ++c) {
+      const index_t offset = b * channels + c;
+      float *out_base = output + offset * out_img_size;
+      const float *input_base = input + offset * in_img_size;
+      const float *kernel_base = filter + c * 9;
+      const float *in = input_base;
+
+      const float *k0 = kernel_base;
+      const float *k1 = kernel_base + 3;
+      const float *k2 = kernel_base + 5;
+
+#if defined(MACE_ENABLE_NEON)
+      float32x4_t k0_vec = vld1q_f32(k0);
+      float32x4_t k1_vec = vld1q_f32(k1);
+      float32x4_t k2_vec = vld1q_f32(k2);
+#endif
+      for (index_t i = 0; i < h; ++i) {
+        float *out_row_base = out_base + i * 2 * outw;
+        float *out_row_0 = out_row_base;
+        float *out_row_1 = out_row_0 + outw;
+        float *out_row_2 = out_row_1 + outw;
+
+        index_t j = 0;
+#if defined(MACE_ENABLE_NEON)
+        for (; j + 3 < w; j += 4) {
+          float32x4_t in_vec = vld1q_f32(in);
+
+          // out row 0
+          float32x4x2_t out00 = vld2q_f32(out_row_0);
+          out00.val[0] =
+            neon_vfma_lane_0(out00.val[0], in_vec, k0_vec);
+          out00.val[1] =
+            neon_vfma_lane_1(out00.val[1], in_vec, k0_vec);
+          vst2q_f32(out_row_0, out00);
+
+          float32x4x2_t out01 = vld2q_f32(out_row_0 + 2);
+          out01.val[0] =
+            neon_vfma_lane_2(out01.val[0], in_vec, k0_vec);
+          vst2q_f32(out_row_0 + 2, out01);
+
+          // out row 1
+          float32x4x2_t out10 = vld2q_f32(out_row_1);
+          out10.val[0] =
+            neon_vfma_lane_0(out10.val[0], in_vec, k1_vec);
+          out10.val[1] =
+            neon_vfma_lane_1(out10.val[1], in_vec, k1_vec);
+          vst2q_f32(out_row_1, out10);
+
+          float32x4x2_t out11 = vld2q_f32(out_row_1 + 2);
+          out11.val[0] =
+            neon_vfma_lane_2(out11.val[0], in_vec, k1_vec);
+          vst2q_f32(out_row_1 + 2, out11);
+
+          // out row 2
+          float32x4x2_t out20 = vld2q_f32(out_row_2);
+          out20.val[0] =
+            neon_vfma_lane_1(out20.val[0], in_vec, k2_vec);
+          out20.val[1] =
+            neon_vfma_lane_2(out20.val[1], in_vec, k2_vec);
+          vst2q_f32(out_row_2, out20);
+
+          float32x4x2_t out21 = vld2q_f32(out_row_2 + 2);
+          out21.val[0] =
+            neon_vfma_lane_3(out21.val[0], in_vec, k2_vec);
+          vst2q_f32(out_row_2 + 2, out21);
+
+          in += 4;
+          out_row_0 += 8;
+          out_row_1 += 8;
+          out_row_2 += 8;
+        }
+#endif
+        for (; j < w; ++j) {
+          float val = in[0];
+
+          for (int k = 0; k < 3; ++k) {
+            out_row_0[k] += val * k0[k];
+            out_row_1[k] += val * k1[k];
+            out_row_2[k] += val * k2[k + 1];
+          }
+
+          in++;
+          out_row_0 += 2;
+          out_row_1 += 2;
+          out_row_2 += 2;
+        }
+      }
+    }
+  }
+}
+
+void GroupDeconv2dNeonK3x3S1(const float *input,
+                             const float *filter,
+                             const int group,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
+                             float *output) {
+  const index_t inch = in_shape[1];
+  const index_t h = in_shape[2];
+  const index_t w = in_shape[3];
+
+  const index_t outch = out_shape[1];
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+
+  const index_t in_img_size = h * w;
+  const index_t out_img_size = outh * outw;
+
+  const index_t inch_g = inch / group;
+  const index_t outch_g = outch / group;
+
+#pragma omp parallel for collapse(3)
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (int g = 0; g < group; ++g) {
+      for (index_t oc = 0; oc < outch_g; oc += 2) {
+        if (oc + 1 < outch_g) {
+          const index_t out_offset = b * outch + outch_g * g + oc;
+          float *out_base0 = output + out_offset * out_img_size;
+          float *out_base1 = out_base0 + out_img_size;
+          for (index_t ic = 0; ic < inch_g; ++ic) {
+            const index_t in_offset = b * inch + inch_g * g + ic;
+            const float *input_base = input + in_offset * in_img_size;
+            const index_t kernel_offset = (oc * group + g) * inch_g + ic;
+            const float *kernel_base0 = filter + kernel_offset * 9;
+            const float *kernel_base1 = kernel_base0 + inch * 9;
+            const float *in = input_base;
+
+            // output channel 0
+            const float *k0_0 = kernel_base0;
+            const float *k0_1 = kernel_base0 + 3;
+            const float *k0_2 = kernel_base0 + 5;
+            // output channel 1
+            const float *k1_0 = kernel_base1;
+            const float *k1_1 = kernel_base1 + 3;
+            const float *k1_2 = kernel_base1 + 5;
+
+#if defined(MACE_ENABLE_NEON)
+            // load filter
+          float32x4_t k00_vec, k01_vec, k02_vec;
+          float32x4_t k10_vec, k11_vec, k12_vec;
+
+          k00_vec = vld1q_f32(k0_0);
+          k01_vec = vld1q_f32(k0_1);
+          k02_vec = vld1q_f32(k0_2);
+
+          k10_vec = vld1q_f32(k1_0);
+          k11_vec = vld1q_f32(k1_1);
+          k12_vec = vld1q_f32(k1_2);
+#endif
+            for (index_t i = 0; i < h; ++i) {
+              float *out_row_base0 = out_base0 + i * outw;
+              float *out_row0_0 = out_row_base0;
+              float *out_row0_1 = out_row_base0 + outw;
+              float *out_row0_2 = out_row_base0 + 2 * outw;
+
+              float *out_row_base1 = out_base1 + i * outw;
+              float *out_row1_0 = out_row_base1;
+              float *out_row1_1 = out_row_base1 + outw;
+              float *out_row1_2 = out_row_base1 + 2 * outw;
+
+              index_t j = 0;
+#if defined(MACE_ENABLE_NEON)
+              for (; j + 3 < w; j += 4) {
+              float32x4_t in_vec = vld1q_f32(in);
+
+              float32x4_t out00, out01, out02;
+              float32x4_t out10, out11, out12;
+              float32x4_t out20, out21, out22;
+
+              out00 = vld1q_f32(out_row0_0);
+              out00 = neon_vfma_lane_0(out00, in_vec, k00_vec);
+              vst1q_f32(out_row0_0, out00);
+
+              out01 = vld1q_f32(out_row0_0 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k00_vec);
+              vst1q_f32(out_row0_0 + 1, out01);
+
+              out02 = vld1q_f32(out_row0_0 + 2);
+              out02 = neon_vfma_lane_2(out02, in_vec, k00_vec);
+              vst1q_f32(out_row0_0 + 2, out02);
+
+              out10 = vld1q_f32(out_row0_1 + 0);
+              out10 = neon_vfma_lane_0(out10, in_vec, k01_vec);
+              vst1q_f32(out_row0_1 + 0, out10);
+
+              out11 = vld1q_f32(out_row0_1 + 1);
+              out11 = neon_vfma_lane_1(out11, in_vec, k01_vec);
+              vst1q_f32(out_row0_1 + 1, out11);
+
+              out12 = vld1q_f32(out_row0_1 + 2);
+              out12 = neon_vfma_lane_2(out12, in_vec, k01_vec);
+              vst1q_f32(out_row0_1 + 2, out12);
+
+              out20 = vld1q_f32(out_row0_2 + 0);
+              out20 = neon_vfma_lane_1(out20, in_vec, k02_vec);
+              vst1q_f32(out_row0_2 + 0, out20);
+
+              out21 = vld1q_f32(out_row0_2 + 1);
+              out21 = neon_vfma_lane_2(out21, in_vec, k02_vec);
+              vst1q_f32(out_row0_2 + 1, out21);
+
+              out22 = vld1q_f32(out_row0_2 + 2);
+              out22 = neon_vfma_lane_3(out22, in_vec, k02_vec);
+              vst1q_f32(out_row0_2 + 2, out22);
+
+              out00 = vld1q_f32(out_row1_0 + 0);
+              out00 = neon_vfma_lane_0(out00, in_vec, k10_vec);
+              vst1q_f32(out_row1_0 + 0, out00);
+
+              out01 = vld1q_f32(out_row1_0 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k10_vec);
+              vst1q_f32(out_row1_0 + 1, out01);
+
+              out02 = vld1q_f32(out_row1_0 + 2);
+              out02 = neon_vfma_lane_2(out02, in_vec, k10_vec);
+              vst1q_f32(out_row1_0 + 2, out02);
+
+              out10 = vld1q_f32(out_row1_1 + 0);
+              out10 = neon_vfma_lane_0(out10, in_vec, k11_vec);
+              vst1q_f32(out_row1_1 + 0, out10);
+
+              out11 = vld1q_f32(out_row1_1 + 1);
+              out11 = neon_vfma_lane_1(out11, in_vec, k11_vec);
+              vst1q_f32(out_row1_1 + 1, out11);
+
+              out12 = vld1q_f32(out_row1_1 + 2);
+              out12 = neon_vfma_lane_2(out12, in_vec, k11_vec);
+              vst1q_f32(out_row1_1 + 2, out12);
+
+              out20 = vld1q_f32(out_row1_2 + 0);
+              out20 = neon_vfma_lane_1(out20, in_vec, k12_vec);
+              vst1q_f32(out_row1_2 + 0, out20);
+
+              out21 = vld1q_f32(out_row1_2 + 1);
+              out21 = neon_vfma_lane_2(out21, in_vec, k12_vec);
+              vst1q_f32(out_row1_2 + 1, out21);
+
+              out22 = vld1q_f32(out_row1_2 + 2);
+              out22 = neon_vfma_lane_3(out22, in_vec, k12_vec);
+              vst1q_f32(out_row1_2 + 2, out22);
+
+              in += 4;
+              out_row0_0 += 4;
+              out_row0_1 += 4;
+              out_row0_2 += 4;
+              out_row1_0 += 4;
+              out_row1_1 += 4;
+              out_row1_2 += 4;
+            }
+#endif
+              for (; j < w; ++j) {
+                float val = in[0];
+                for (int k = 0; k < 3; ++k) {
+                  out_row0_0[k] += val * k0_0[k];
+                  out_row0_1[k] += val * k0_1[k];
+                  out_row0_2[k] += val * k0_2[k + 1];
+                  out_row1_0[k] += val * k1_0[k];
+                  out_row1_1[k] += val * k1_1[k];
+                  out_row1_2[k] += val * k1_2[k + 1];
+                }
+                in++;
+                out_row0_0++;
+                out_row0_1++;
+                out_row0_2++;
+                out_row1_0++;
+                out_row1_1++;
+                out_row1_2++;
+              }
+            }
+          }
+        } else {
+          const index_t out_offset = b * outch + outch_g * g + oc;
+          float *out_base0 = output + out_offset * out_img_size;
+          for (index_t ic = 0; ic < inch_g; ++ic) {
+            const index_t in_offset = (b * group + g) * inch_g + ic;
+            const float *input_base = input + in_offset * in_img_size;
+            const index_t kernel_offset = (oc * group + g) * inch_g + ic;
+            const float *kernel_base0 = filter + kernel_offset * 9;
+            const float *in = input_base;
+            const float *k0_0 = kernel_base0;
+            const float *k0_1 = kernel_base0 + 3;
+            const float *k0_2 = kernel_base0 + 5;
+
+#if defined(MACE_ENABLE_NEON)
+            // load filter
+          float32x4_t k00_vec = vld1q_f32(k0_0);
+          float32x4_t k01_vec = vld1q_f32(k0_1);
+          float32x4_t k02_vec = vld1q_f32(k0_2);
+#endif
+            for (index_t i = 0; i < h; ++i) {
+              float *out_row_base0 = out_base0 + i * outw;
+              float *out_row0_0 = out_row_base0;
+              float *out_row0_1 = out_row_base0 + outw;
+              float *out_row0_2 = out_row_base0 + 2 * outw;
+              index_t j = 0;
+#if defined(MACE_ENABLE_NEON)
+              for (; j + 3 < w; j += 4) {
+              float32x4_t in_vec = vld1q_f32(in);
+
+              float32x4_t out00, out01, out02;
+              float32x4_t out10, out11, out12;
+              float32x4_t out20, out21, out22;
+
+              out00 = vld1q_f32(out_row0_0 + 0);
+              out00 = neon_vfma_lane_0(out00, in_vec, k00_vec);
+              vst1q_f32(out_row0_0 + 0, out00);
+
+              out01 = vld1q_f32(out_row0_0 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k00_vec);
+              vst1q_f32(out_row0_0 + 1, out01);
+
+              out02 = vld1q_f32(out_row0_0 + 2);
+              out02 = neon_vfma_lane_2(out02, in_vec, k00_vec);
+              vst1q_f32(out_row0_0 + 2, out02);
+
+              out10 = vld1q_f32(out_row0_1 + 0);
+              out10 = neon_vfma_lane_0(out10, in_vec, k01_vec);
+              vst1q_f32(out_row0_1 + 0, out10);
+
+              out11 = vld1q_f32(out_row0_1 + 1);
+              out11 = neon_vfma_lane_1(out11, in_vec, k01_vec);
+              vst1q_f32(out_row0_1 + 1, out11);
+
+              out12 = vld1q_f32(out_row0_1 + 2);
+              out12 = neon_vfma_lane_2(out12, in_vec, k01_vec);
+              vst1q_f32(out_row0_1 + 2, out12);
+
+              out20 = vld1q_f32(out_row0_2 + 0);
+              out20 = neon_vfma_lane_1(out20, in_vec, k02_vec);
+              vst1q_f32(out_row0_2 + 0, out20);
+
+              out21 = vld1q_f32(out_row0_2 + 1);
+              out21 = neon_vfma_lane_2(out21, in_vec, k02_vec);
+              vst1q_f32(out_row0_2 + 1, out21);
+
+              out22 = vld1q_f32(out_row0_2 + 2);
+              out22 = neon_vfma_lane_3(out22, in_vec, k02_vec);
+              vst1q_f32(out_row0_2 + 2, out22);
+
+              in += 4;
+              out_row0_0 += 4;
+              out_row0_1 += 4;
+              out_row0_2 += 4;
+            }
+#endif
+              for (; j < w; ++j) {
+                float val = in[0];
+                for (int k = 0; k < 3; ++k) {
+                  out_row0_0[k] += val * k0_0[k];
+                  out_row0_1[k] += val * k0_1[k];
+                  out_row0_2[k] += val * k0_2[k + 1];
+                }
+                in++;
+                out_row0_0++;
+                out_row0_1++;
+                out_row0_2++;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void GroupDeconv2dNeonK3x3S2(const float *input,
+                             const float *filter,
+                             const int group,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
+                             float *output) {
+  const index_t inch = in_shape[1];
+  const index_t h = in_shape[2];
+  const index_t w = in_shape[3];
+
+  const index_t outch = out_shape[1];
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+
+  const index_t in_img_size = h * w;
+  const index_t out_img_size = outh * outw;
+
+  const index_t inch_g = inch / group;
+  const index_t outch_g = outch / group;
+
+#pragma omp parallel for collapse(3)
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (int g = 0; g < group; ++g) {
+      for (index_t oc = 0; oc < outch_g; ++oc) {
+        const index_t out_offset = b * outch + outch_g * g + oc;
+        float *out_base = output + out_offset * out_img_size;
+        for (index_t ic = 0; ic < inch_g; ++ic) {
+          const index_t in_offset = b * inch + inch_g * g + ic;
+          const float *input_base = input + in_offset * in_img_size;
+          const index_t kernel_offset = (oc * group + g) * inch_g + ic;
+          const float *kernel_base = filter + kernel_offset * 9;
+          const float *in = input_base;
+
+          const float *k0 = kernel_base;
+          const float *k1 = kernel_base + 3;
+          const float *k2 = kernel_base + 5;
+
+#if defined(MACE_ENABLE_NEON)
+          float32x4_t k0_vec = vld1q_f32(k0);
+        float32x4_t k1_vec = vld1q_f32(k1);
+        float32x4_t k2_vec = vld1q_f32(k2);
+#endif
+          for (index_t i = 0; i < h; ++i) {
+            float *out_row_base = out_base + i * 2 * outw;
+            float *out_row_0 = out_row_base;
+            float *out_row_1 = out_row_0 + outw;
+            float *out_row_2 = out_row_1 + outw;
+
+            index_t j = 0;
+#if defined(MACE_ENABLE_NEON)
+            for (; j + 3 < w; j += 4) {
+            float32x4_t in_vec = vld1q_f32(in);
+
+            // out row 0
+            float32x4x2_t out00 = vld2q_f32(out_row_0);
+            out00.val[0] =
+              neon_vfma_lane_0(out00.val[0], in_vec, k0_vec);
+            out00.val[1] =
+              neon_vfma_lane_1(out00.val[1], in_vec, k0_vec);
+            vst2q_f32(out_row_0, out00);
+
+            float32x4x2_t out01 = vld2q_f32(out_row_0 + 2);
+            out01.val[0] =
+              neon_vfma_lane_2(out01.val[0], in_vec, k0_vec);
+            vst2q_f32(out_row_0 + 2, out01);
+
+            // out row 1
+            float32x4x2_t out10 = vld2q_f32(out_row_1);
+            out10.val[0] =
+              neon_vfma_lane_0(out10.val[0], in_vec, k1_vec);
+            out10.val[1] =
+              neon_vfma_lane_1(out10.val[1], in_vec, k1_vec);
+            vst2q_f32(out_row_1, out10);
+
+            float32x4x2_t out11 = vld2q_f32(out_row_1 + 2);
+            out11.val[0] =
+              neon_vfma_lane_2(out11.val[0], in_vec, k1_vec);
+            vst2q_f32(out_row_1 + 2, out11);
+
+            // out row 2
+            float32x4x2_t out20 = vld2q_f32(out_row_2);
+            out20.val[0] =
+              neon_vfma_lane_1(out20.val[0], in_vec, k2_vec);
+            out20.val[1] =
+              neon_vfma_lane_2(out20.val[1], in_vec, k2_vec);
+            vst2q_f32(out_row_2, out20);
+
+            float32x4x2_t out21 = vld2q_f32(out_row_2 + 2);
+            out21.val[0] =
+              neon_vfma_lane_3(out21.val[0], in_vec, k2_vec);
+            vst2q_f32(out_row_2 + 2, out21);
+
+            in += 4;
+            out_row_0 += 8;
+            out_row_1 += 8;
+            out_row_2 += 8;
+          }
+#endif
+            for (; j < w; ++j) {
+              float val = in[0];
+
+              for (int k = 0; k < 3; ++k) {
+                out_row_0[k] += val * k0[k];
+                out_row_1[k] += val * k1[k];
+                out_row_2[k] += val * k2[k + 1];
+              }
+
+              in++;
+              out_row_0 += 2;
+              out_row_1 += 2;
+              out_row_2 += 2;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
new file mode 100644
index 00000000..6ae7dbb1
--- /dev/null
+++ b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
@@ -0,0 +1,805 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/macros.h"
+#include "mace/ops/arm/deconv_2d_neon.h"
+
+namespace mace {
+namespace ops {
+
+void DepthwiseDeconv2dNeonK4x4S1(const float *input,
+                                 const float *filter,
+                                 const index_t *in_shape,
+                                 const index_t *out_shape,
+                                 float *output) {
+  const index_t batch = in_shape[0];
+  const index_t channels = in_shape[1];
+  const index_t w = in_shape[3];
+  const index_t h = in_shape[2];
+  const index_t in_img_size = h * w;
+
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+  const index_t out_img_size = outh * outw;
+
+#pragma omp parallel for collapse(2)
+  for (int b = 0; b < batch; ++b) {
+    for (int c = 0; c < channels; ++c) {
+      const index_t offset = b * channels + c;
+      float *out_base = output + offset * out_img_size;
+      const float *input_base = input + offset * in_img_size;
+      const float *kernel_base = filter + c * 16;
+      const float *in = input_base;
+      const float *k0 = kernel_base;
+      const float *k1 = kernel_base + 4;
+      const float *k2 = kernel_base + 8;
+      const float *k3 = kernel_base + 12;
+#if defined(MACE_ENABLE_NEON)
+      float32x4_t k0_vec = vld1q_f32(k0);
+      float32x4_t k1_vec = vld1q_f32(k1);
+      float32x4_t k2_vec = vld1q_f32(k2);
+      float32x4_t k3_vec = vld1q_f32(k3);
+#endif
+      for (int i = 0; i < h; i++) {
+        float *out_row = out_base + i * outw;
+        float *out_row_0 = out_row;
+        float *out_row_1 = out_row_0 + outw;
+        float *out_row_2 = out_row_1 + outw;
+        float *out_row_3 = out_row_2 + outw;
+        int j = 0;
+#if defined(MACE_ENABLE_NEON)
+        for (; j + 3 < w; j += 4) {
+          float32x4_t in_vec = vld1q_f32(in);
+
+          float32x4_t out00 = vld1q_f32(out_row_0);
+          out00 = neon_vfma_lane_0(out00, in_vec, k0_vec);
+          vst1q_f32(out_row_0, out00);
+
+          float32x4_t out01 = vld1q_f32(out_row_0 + 1);
+          out01 = neon_vfma_lane_1(out01, in_vec, k0_vec);
+          vst1q_f32(out_row_0 + 1, out01);
+
+          float32x4_t out02 = vld1q_f32(out_row_0 + 2);
+          out02 = neon_vfma_lane_2(out02, in_vec, k0_vec);
+          vst1q_f32(out_row_0 + 2, out02);
+
+          float32x4_t out03 = vld1q_f32(out_row_0 + 3);
+          out03 = neon_vfma_lane_3(out03, in_vec, k0_vec);
+          vst1q_f32(out_row_0 + 3, out03);
+
+          //
+          float32x4_t out10 = vld1q_f32(out_row_1);
+          out10 = neon_vfma_lane_0(out10, in_vec, k1_vec);
+          vst1q_f32(out_row_1, out10);
+
+          float32x4_t out11 = vld1q_f32(out_row_1 + 1);
+          out11 = neon_vfma_lane_1(out11, in_vec, k1_vec);
+          vst1q_f32(out_row_1 + 1, out11);
+
+          float32x4_t out12 = vld1q_f32(out_row_1 + 2);
+          out12 = neon_vfma_lane_2(out12, in_vec, k1_vec);
+          vst1q_f32(out_row_1 + 2, out12);
+
+          float32x4_t out13 = vld1q_f32(out_row_1 + 3);
+          out13 = neon_vfma_lane_3(out13, in_vec, k1_vec);
+          vst1q_f32(out_row_1 + 3, out13);
+
+          //
+          float32x4_t out20 = vld1q_f32(out_row_2 + 0);
+          out20 = neon_vfma_lane_0(out20, in_vec, k2_vec);
+          vst1q_f32(out_row_2 + 0, out20);
+
+          float32x4_t out21 = vld1q_f32(out_row_2 + 1);
+          out21 = neon_vfma_lane_1(out21, in_vec, k2_vec);
+          vst1q_f32(out_row_2 + 1, out21);
+
+          float32x4_t out22 = vld1q_f32(out_row_2 + 2);
+          out22 = neon_vfma_lane_2(out22, in_vec, k2_vec);
+          vst1q_f32(out_row_2 + 2, out22);
+
+          float32x4_t out23 = vld1q_f32(out_row_2 + 3);
+          out23 = neon_vfma_lane_3(out23, in_vec, k2_vec);
+          vst1q_f32(out_row_2 + 3, out23);
+
+          //
+          float32x4_t out30 = vld1q_f32(out_row_3 + 0);
+          out30 = neon_vfma_lane_0(out30, in_vec, k3_vec);
+          vst1q_f32(out_row_3 + 0, out30);
+
+          float32x4_t out31 = vld1q_f32(out_row_3 + 1);
+          out31 = neon_vfma_lane_1(out31, in_vec, k3_vec);
+          vst1q_f32(out_row_3 + 1, out31);
+
+          float32x4_t out32 = vld1q_f32(out_row_3 + 2);
+          out32 = neon_vfma_lane_2(out32, in_vec, k3_vec);
+          vst1q_f32(out_row_3 + 2, out32);
+
+          float32x4_t out33 = vld1q_f32(out_row_3 + 3);
+          out33 = neon_vfma_lane_3(out33, in_vec, k3_vec);
+          vst1q_f32(out_row_3 + 3, out33);
+
+          in += 4;
+          out_row_0 += 4;
+          out_row_1 += 4;
+          out_row_2 += 4;
+          out_row_3 += 4;
+        }
+#endif
+        for (; j < w; j++) {
+          float val = in[0];
+          for (int k = 0; k < 4; ++k) {
+            out_row_0[k] += val * k0[k];
+            out_row_1[k] += val * k1[k];
+            out_row_2[k] += val * k2[k];
+            out_row_3[k] += val * k3[k];
+          }
+          in++;
+          out_row_0++;
+          out_row_1++;
+          out_row_2++;
+          out_row_3++;
+        }
+      }
+    }
+  }
+}
+
+void DepthwiseDeconv2dNeonK4x4S2(const float *input,
+                                 const float *filter,
+                                 const index_t *in_shape,
+                                 const index_t *out_shape,
+                                 float *output) {
+  const index_t w = in_shape[3];
+  const index_t h = in_shape[2];
+  const index_t channels = in_shape[1];
+  const index_t in_img_size = h * w;
+
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+  const index_t out_img_size = outh * outw;
+
+#pragma omp parallel for collapse(2)
+  for (int b = 0; b < out_shape[0]; ++b) {
+    for (int c = 0; c < channels; ++c) {
+      const index_t offset = b * channels + c;
+      float *out_base = output + offset * out_img_size;
+      const float *input_base = input + offset * in_img_size;
+      const float *kernel_base = filter + c * 16;
+      const float *in = input_base;
+
+      const float *k0 = kernel_base;
+      const float *k1 = kernel_base + 4;
+      const float *k2 = kernel_base + 8;
+      const float *k3 = kernel_base + 12;
+#if defined(MACE_ENABLE_NEON)
+      float32x4_t k0_vec = vld1q_f32(k0);
+      float32x4_t k1_vec = vld1q_f32(k1);
+      float32x4_t k2_vec = vld1q_f32(k2);
+      float32x4_t k3_vec = vld1q_f32(k3);
+#endif
+      for (int i = 0; i < h; i++) {
+        float *out_row = out_base + 2 * i * outw;
+
+        float *out_row_0 = out_row;
+        float *out_row_1 = out_row_0 + outw;
+        float *out_row_2 = out_row_1 + outw;
+        float *out_row_3 = out_row_2 + outw;
+
+        int j = 0;
+#if defined(MACE_ENABLE_NEON)
+        for (; j + 3 < w; j += 4) {
+          float32x4_t in_vec = vld1q_f32(in);
+
+          // row 0
+          float32x4x2_t out0 = vld2q_f32(out_row_0);
+          out0.val[0] =
+            neon_vfma_lane_0(out0.val[0], in_vec, k0_vec);
+          out0.val[1] =
+            neon_vfma_lane_1(out0.val[1], in_vec, k0_vec);
+          vst2q_f32(out_row_0, out0);
+          out0 = vld2q_f32(out_row_0 + 2);
+          out0.val[0] =
+            neon_vfma_lane_2(out0.val[0], in_vec, k0_vec);
+          out0.val[1] =
+            neon_vfma_lane_3(out0.val[1], in_vec, k0_vec);
+          vst2q_f32(out_row_0 + 2, out0);
+
+          // row 1
+          float32x4x2_t out1 = vld2q_f32(out_row_1);
+          out1.val[0] =
+            neon_vfma_lane_0(out1.val[0], in_vec, k1_vec);
+          out1.val[1] =
+            neon_vfma_lane_1(out1.val[1], in_vec, k1_vec);
+          vst2q_f32(out_row_1, out1);
+          out1 = vld2q_f32(out_row_1 + 2);
+          out1.val[0] =
+            neon_vfma_lane_2(out1.val[0], in_vec, k1_vec);
+          out1.val[1] =
+            neon_vfma_lane_3(out1.val[1], in_vec, k1_vec);
+          vst2q_f32(out_row_1 + 2, out1);
+
+          // row 2
+          float32x4x2_t out2 = vld2q_f32(out_row_2);
+          out2.val[0] =
+            neon_vfma_lane_0(out2.val[0], in_vec, k2_vec);
+          out2.val[1] =
+            neon_vfma_lane_1(out2.val[1], in_vec, k2_vec);
+          vst2q_f32(out_row_2, out2);
+          out2 = vld2q_f32(out_row_2 + 2);
+          out2.val[0] =
+            neon_vfma_lane_2(out2.val[0], in_vec, k2_vec);
+          out2.val[1] =
+            neon_vfma_lane_3(out2.val[1], in_vec, k2_vec);
+          vst2q_f32(out_row_2 + 2, out2);
+
+          // row 3
+          float32x4x2_t out3 = vld2q_f32(out_row_3);
+          out3.val[0] =
+            neon_vfma_lane_0(out3.val[0], in_vec, k3_vec);
+          out3.val[1] =
+            neon_vfma_lane_1(out3.val[1], in_vec, k3_vec);
+          vst2q_f32(out_row_3, out3);
+          out3 = vld2q_f32(out_row_3 + 2);
+          out3.val[0] =
+            neon_vfma_lane_2(out3.val[0], in_vec, k3_vec);
+          out3.val[1] =
+            neon_vfma_lane_3(out3.val[1], in_vec, k3_vec);
+          vst2q_f32(out_row_3 + 2, out3);
+
+          in += 4;
+          out_row_0 += 8;
+          out_row_1 += 8;
+          out_row_2 += 8;
+          out_row_3 += 8;
+        }
+#endif
+        for (; j < w; j++) {
+          float val = in[0];
+          for (int k = 0; k < 4; ++k) {
+            out_row_0[k] += val * k0[k];
+            out_row_1[k] += val * k1[k];
+            out_row_2[k] += val * k2[k];
+            out_row_3[k] += val * k3[k];
+          }
+          in++;
+          out_row_0 += 2;
+          out_row_1 += 2;
+          out_row_2 += 2;
+          out_row_3 += 2;
+        }
+      }
+    }
+  }
+}
+
+void GroupDeconv2dNeonK4x4S1(const float *input,
+                             const float *filter,
+                             const int group,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
+                             float *output) {
+  const index_t w = in_shape[3];
+  const index_t h = in_shape[2];
+  const index_t inch = in_shape[1];
+
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+  const index_t outch = out_shape[1];
+
+  const index_t in_img_size = h * w;
+  const index_t out_img_size = outh * outw;
+
+  const index_t inch_g = inch / group;
+  const index_t outch_g = outch / group;
+
+#pragma omp parallel for collapse(3)
+  for (int b = 0; b < out_shape[0]; ++b) {
+    for (int g = 0; g < group; ++g) {
+      for (int oc = 0; oc < outch_g; oc += 2) {
+        if (oc + 1 < outch_g) {
+          const index_t out_offset =
+              (b * outch + outch_g * g + oc) * out_img_size;
+          float *out_base = output + out_offset;
+          float *out_base1 = out_base + out_img_size;
+          for (int ic = 0; ic < inch_g; ic++) {
+            const index_t in_offset =
+                (b * inch + inch_g * g + ic) * in_img_size;
+            const float *input_base = input + in_offset;
+            const float *in = input_base;
+            const index_t kernel_offset =
+                ((oc * group + g) * inch_g  + ic) * 16;
+            const float *kernel_base = filter + kernel_offset;
+            const float *k0 = kernel_base;
+            const float *k1 = kernel_base + 4;
+            const float *k2 = kernel_base + 8;
+            const float *k3 = kernel_base + 12;
+
+            const float *kernel_base1 = kernel_base + inch * 16;
+            const float *k10 = kernel_base1;
+            const float *k11 = kernel_base1 + 4;
+            const float *k12 = kernel_base1 + 8;
+            const float *k13 = kernel_base1 + 12;
+#if defined(MACE_ENABLE_NEON)
+            float32x4_t k0_vec = vld1q_f32(k0);
+            float32x4_t k1_vec = vld1q_f32(k1);
+            float32x4_t k2_vec = vld1q_f32(k2);
+            float32x4_t k3_vec = vld1q_f32(k3);
+
+            float32x4_t k10_vec = vld1q_f32(k10);
+            float32x4_t k11_vec = vld1q_f32(k11);
+            float32x4_t k12_vec = vld1q_f32(k12);
+            float32x4_t k13_vec = vld1q_f32(k13);
+#endif
+            for (int i = 0; i < h; i++) {
+              float *out_row = out_base + i * outw;
+
+              float *out_row_0 = out_row;
+              float *out_row_1 = out_row_0 + outw;
+              float *out_row_2 = out_row_1 + outw;
+              float *out_row_3 = out_row_2 + outw;
+
+              float *out_row1 = out_base1 + i * outw;
+
+              float *out_row1_0 = out_row1;
+              float *out_row1_1 = out_row1_0 + outw;
+              float *out_row1_2 = out_row1_1 + outw;
+              float *out_row1_3 = out_row1_2 + outw;
+
+              int j = 0;
+#if defined(MACE_ENABLE_NEON)
+              for (; j + 3 < w; j += 4) {
+              float32x4_t in_vec = vld1q_f32(in);
+              float32x4_t out00, out01, out02, out03;
+              float32x4_t out10, out11, out12, out13;
+
+              out00 = vld1q_f32(out_row_0);
+              out00 = neon_vfma_lane_0(out00, in_vec, k0_vec);
+              vst1q_f32(out_row_0, out00);
+
+              out10 = vld1q_f32(out_row1_0);
+              out10 = neon_vfma_lane_0(out10, in_vec, k10_vec);
+              vst1q_f32(out_row1_0, out10);
+
+              out01 = vld1q_f32(out_row_0 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k0_vec);
+              vst1q_f32(out_row_0 + 1, out01);
+
+              out11 = vld1q_f32(out_row1_0 + 1);
+              out11 = neon_vfma_lane_1(out11, in_vec, k10_vec);
+              vst1q_f32(out_row1_0 + 1, out11);
+
+              out02 = vld1q_f32(out_row_0 + 2);
+              out02 = neon_vfma_lane_2(out02, in_vec, k0_vec);
+              vst1q_f32(out_row_0 + 2, out02);
+
+              out12 = vld1q_f32(out_row1_0 + 2);
+              out12 = neon_vfma_lane_2(out12, in_vec, k10_vec);
+              vst1q_f32(out_row1_0 + 2, out12);
+
+              out03 = vld1q_f32(out_row_0 + 3);
+              out03 = neon_vfma_lane_3(out03, in_vec, k0_vec);
+              vst1q_f32(out_row_0 + 3, out03);
+
+              out13 = vld1q_f32(out_row1_0 + 3);
+              out13 = neon_vfma_lane_3(out13, in_vec, k10_vec);
+              vst1q_f32(out_row1_0 + 3, out13);
+
+              //
+              out00 = vld1q_f32(out_row_1);
+              out00 = neon_vfma_lane_0(out00, in_vec, k1_vec);
+              vst1q_f32(out_row_1, out00);
+
+              out10 = vld1q_f32(out_row1_1);
+              out10 = neon_vfma_lane_0(out10, in_vec, k11_vec);
+              vst1q_f32(out_row1_1, out10);
+
+              out01 = vld1q_f32(out_row_1 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k1_vec);
+              vst1q_f32(out_row_1 + 1, out01);
+
+              out11 = vld1q_f32(out_row1_1 + 1);
+              out11 = neon_vfma_lane_1(out11, in_vec, k11_vec);
+              vst1q_f32(out_row1_1 + 1, out11);
+
+              out02 = vld1q_f32(out_row_1 + 2);
+              out02 = neon_vfma_lane_2(out02, in_vec, k1_vec);
+              vst1q_f32(out_row_1 + 2, out02);
+
+              out12 = vld1q_f32(out_row1_1 + 2);
+              out12 = neon_vfma_lane_2(out12, in_vec, k11_vec);
+              vst1q_f32(out_row1_1 + 2, out12);
+
+              out03 = vld1q_f32(out_row_1 + 3);
+              out03 = neon_vfma_lane_3(out03, in_vec, k1_vec);
+              vst1q_f32(out_row_1 + 3, out03);
+
+              out13 = vld1q_f32(out_row1_1 + 3);
+              out13 = neon_vfma_lane_3(out13, in_vec, k11_vec);
+              vst1q_f32(out_row1_1 + 3, out13);
+
+              //
+              out00 = vld1q_f32(out_row_2 + 0);
+              out00 = neon_vfma_lane_0(out00, in_vec, k2_vec);
+              vst1q_f32(out_row_2 + 0, out00);
+
+              out10 = vld1q_f32(out_row1_2 + 0);
+              out10 = neon_vfma_lane_0(out10, in_vec, k12_vec);
+              vst1q_f32(out_row1_2 + 0, out10);
+
+              out01 = vld1q_f32(out_row_2 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k2_vec);
+              vst1q_f32(out_row_2 + 1, out01);
+
+              out11 = vld1q_f32(out_row1_2 + 1);
+              out11 = neon_vfma_lane_1(out11, in_vec, k12_vec);
+              vst1q_f32(out_row1_2 + 1, out11);
+
+              out02 = vld1q_f32(out_row_2 + 2);
+              out02 = neon_vfma_lane_2(out02, in_vec, k2_vec);
+              vst1q_f32(out_row_2 + 2, out02);
+
+              out12 = vld1q_f32(out_row1_2 + 2);
+              out12 = neon_vfma_lane_2(out12, in_vec, k12_vec);
+              vst1q_f32(out_row1_2 + 2, out12);
+
+              out03 = vld1q_f32(out_row_2 + 3);
+              out03 = neon_vfma_lane_3(out03, in_vec, k2_vec);
+              vst1q_f32(out_row_2 + 3, out03);
+
+              out13 = vld1q_f32(out_row1_2 + 3);
+              out13 = neon_vfma_lane_3(out13, in_vec, k12_vec);
+              vst1q_f32(out_row1_2 + 3, out13);
+
+              //
+              out00 = vld1q_f32(out_row_3 + 0);
+              out00 = neon_vfma_lane_0(out00, in_vec, k3_vec);
+              vst1q_f32(out_row_3 + 0, out00);
+
+              out10 = vld1q_f32(out_row1_3 + 0);
+              out10 = neon_vfma_lane_0(out10, in_vec, k13_vec);
+              vst1q_f32(out_row1_3 + 0, out10);
+
+              out01 = vld1q_f32(out_row_3 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k3_vec);
+              vst1q_f32(out_row_3 + 1, out01);
+
+              out11 = vld1q_f32(out_row1_3 + 1);
+              out11 = neon_vfma_lane_1(out11, in_vec, k13_vec);
+              vst1q_f32(out_row1_3 + 1, out11);
+
+              out02 = vld1q_f32(out_row_3 + 2);
+              out02 = neon_vfma_lane_2(out02, in_vec, k3_vec);
+              vst1q_f32(out_row_3 + 2, out02);
+
+              out12 = vld1q_f32(out_row1_3 + 2);
+              out12 = neon_vfma_lane_2(out12, in_vec, k13_vec);
+              vst1q_f32(out_row1_3 + 2, out12);
+
+              out03 = vld1q_f32(out_row_3 + 3);
+              out03 = neon_vfma_lane_3(out03, in_vec, k3_vec);
+              vst1q_f32(out_row_3 + 3, out03);
+
+              out13 = vld1q_f32(out_row1_3 + 3);
+              out13 = neon_vfma_lane_3(out13, in_vec, k13_vec);
+              vst1q_f32(out_row1_3 + 3, out13);
+
+              in += 4;
+              out_row_0 += 4;
+              out_row_1 += 4;
+              out_row_2 += 4;
+              out_row_3 += 4;
+              out_row1_0 += 4;
+              out_row1_1 += 4;
+              out_row1_2 += 4;
+              out_row1_3 += 4;
+            }
+#endif
+              for (; j < w; j++) {
+                float val = in[0];
+                for (int k = 0; k < 4; ++k) {
+                  out_row_0[k] += val * k0[k];
+                  out_row_1[k] += val * k1[k];
+                  out_row_2[k] += val * k2[k];
+                  out_row_3[k] += val * k3[k];
+                  out_row1_0[k] += val * k10[k];
+                  out_row1_1[k] += val * k11[k];
+                  out_row1_2[k] += val * k12[k];
+                  out_row1_3[k] += val * k13[k];
+                }
+                in++;
+                out_row_0++;
+                out_row_1++;
+                out_row_2++;
+                out_row_3++;
+                out_row1_0++;
+                out_row1_1++;
+                out_row1_2++;
+                out_row1_3++;
+              }
+            }
+          }
+        } else {
+          const index_t out_offset =
+              (b * outch + outch_g * g + oc) * out_img_size;
+          float *out_base = output + out_offset;
+          for (int ic = 0; ic < inch_g; ++ic) {
+            const index_t in_offset =
+                (b * inch + inch_g * g + ic) * in_img_size;
+            const index_t kernel_offset =
+                ((oc * group + g) * inch_g  + ic) * 16;
+
+            const float *input_base = input + in_offset;
+            const float *kernel_base = filter + kernel_offset;
+            const float *in = input_base;
+            const float *k0 = kernel_base;
+            const float *k1 = kernel_base + 4;
+            const float *k2 = kernel_base + 8;
+            const float *k3 = kernel_base + 12;
+#if defined(MACE_ENABLE_NEON)
+            float32x4_t k0_vec = vld1q_f32(k0);
+            float32x4_t k1_vec = vld1q_f32(k1);
+            float32x4_t k2_vec = vld1q_f32(k2);
+            float32x4_t k3_vec = vld1q_f32(k3);
+#endif
+            for (int i = 0; i < h; i++) {
+              float *out_row = out_base + i * outw;
+              float *out_row_0 = out_row;
+              float *out_row_1 = out_row_0 + outw;
+              float *out_row_2 = out_row_1 + outw;
+              float *out_row_3 = out_row_2 + outw;
+              int j = 0;
+#if defined(MACE_ENABLE_NEON)
+              for (; j + 3 < w; j += 4) {
+              float32x4_t in_vec = vld1q_f32(in);
+
+              float32x4_t out00 = vld1q_f32(out_row_0);
+              out00 = neon_vfma_lane_0(out00, in_vec, k0_vec);
+              vst1q_f32(out_row_0, out00);
+
+              float32x4_t out01 = vld1q_f32(out_row_0 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k0_vec);
+              vst1q_f32(out_row_0 + 1, out01);
+
+              float32x4_t out02 = vld1q_f32(out_row_0 + 2);
+              out02 = neon_vfma_lane_2(out02, in_vec, k0_vec);
+              vst1q_f32(out_row_0 + 2, out02);
+
+              float32x4_t out03 = vld1q_f32(out_row_0 + 3);
+              out03 = neon_vfma_lane_3(out03, in_vec, k0_vec);
+              vst1q_f32(out_row_0 + 3, out03);
+
+              //
+              float32x4_t out10 = vld1q_f32(out_row_1);
+              out10 = neon_vfma_lane_0(out10, in_vec, k1_vec);
+              vst1q_f32(out_row_1, out10);
+
+              float32x4_t out11 = vld1q_f32(out_row_1 + 1);
+              out11 = neon_vfma_lane_1(out11, in_vec, k1_vec);
+              vst1q_f32(out_row_1 + 1, out11);
+
+              float32x4_t out12 = vld1q_f32(out_row_1 + 2);
+              out12 = neon_vfma_lane_2(out12, in_vec, k1_vec);
+              vst1q_f32(out_row_1 + 2, out12);
+
+              float32x4_t out13 = vld1q_f32(out_row_1 + 3);
+              out13 = neon_vfma_lane_3(out13, in_vec, k1_vec);
+              vst1q_f32(out_row_1 + 3, out13);
+
+              //
+              float32x4_t out20 = vld1q_f32(out_row_2 + 0);
+              out20 = neon_vfma_lane_0(out20, in_vec, k2_vec);
+              vst1q_f32(out_row_2 + 0, out20);
+
+              float32x4_t out21 = vld1q_f32(out_row_2 + 1);
+              out21 = neon_vfma_lane_1(out21, in_vec, k2_vec);
+              vst1q_f32(out_row_2 + 1, out21);
+
+              float32x4_t out22 = vld1q_f32(out_row_2 + 2);
+              out22 = neon_vfma_lane_2(out22, in_vec, k2_vec);
+              vst1q_f32(out_row_2 + 2, out22);
+
+              float32x4_t out23 = vld1q_f32(out_row_2 + 3);
+              out23 = neon_vfma_lane_3(out23, in_vec, k2_vec);
+              vst1q_f32(out_row_2 + 3, out23);
+
+              //
+              float32x4_t out30 = vld1q_f32(out_row_3 + 0);
+              out30 = neon_vfma_lane_0(out30, in_vec, k3_vec);
+              vst1q_f32(out_row_3 + 0, out30);
+
+              float32x4_t out31 = vld1q_f32(out_row_3 + 1);
+              out31 = neon_vfma_lane_1(out31, in_vec, k3_vec);
+              vst1q_f32(out_row_3 + 1, out31);
+
+              float32x4_t out32 = vld1q_f32(out_row_3 + 2);
+              out32 = neon_vfma_lane_2(out32, in_vec, k3_vec);
+              vst1q_f32(out_row_3 + 2, out32);
+
+              float32x4_t out33 = vld1q_f32(out_row_3 + 3);
+              out33 = neon_vfma_lane_3(out33, in_vec, k3_vec);
+              vst1q_f32(out_row_3 + 3, out33);
+
+              in += 4;
+              out_row_0 += 4;
+              out_row_1 += 4;
+              out_row_2 += 4;
+              out_row_3 += 4;
+            }
+#endif
+              for (; j < w; j++) {
+                float val = in[0];
+                for (int k = 0; k < 4; ++k) {
+                  out_row_0[k] += val * k0[k];
+                  out_row_1[k] += val * k1[k];
+                  out_row_2[k] += val * k2[k];
+                  out_row_3[k] += val * k3[k];
+                }
+                in++;
+                out_row_0++;
+                out_row_1++;
+                out_row_2++;
+                out_row_3++;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void GroupDeconv2dNeonK4x4S2(const float *input,
+                             const float *filter,
+                             const int group,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
+                             float *output) {
+  const index_t w = in_shape[3];
+  const index_t h = in_shape[2];
+  const index_t inch = in_shape[1];
+
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+  const index_t outch = out_shape[1];
+  const index_t in_img_size = h * w;
+  const index_t out_img_size = outh * outw;
+
+  const index_t inch_g = inch / group;
+  const index_t outch_g = outch / group;
+
+#pragma omp parallel for collapse(3)
+  for (int b = 0; b < out_shape[0]; ++b) {
+    for (int g = 0; g < group; ++g) {
+      for (int oc = 0; oc < outch_g; oc++) {
+        const index_t out_offset =
+            (b * outch + outch_g * g + oc) * out_img_size;
+        float *out_base = output + out_offset;
+        for (int ic = 0; ic < inch_g; ic++) {
+          const index_t in_offset =
+              (b * inch + inch_g * g + ic) * in_img_size;
+          const index_t kernel_offset =
+              ((oc * group + g) * inch_g  + ic) * 16;
+          const float *input_base = input + in_offset;
+          const float *kernel_base = filter + kernel_offset;
+          const float *in = input_base;
+
+          const float *k0 = kernel_base;
+          const float *k1 = kernel_base + 4;
+          const float *k2 = kernel_base + 8;
+          const float *k3 = kernel_base + 12;
+#if defined(MACE_ENABLE_NEON)
+          float32x4_t k0_vec = vld1q_f32(k0);
+          float32x4_t k1_vec = vld1q_f32(k1);
+          float32x4_t k2_vec = vld1q_f32(k2);
+          float32x4_t k3_vec = vld1q_f32(k3);
+#endif
+          for (int i = 0; i < h; i++) {
+            float *out_row = out_base + 2 * i * outw;
+
+            float *out_row_0 = out_row;
+            float *out_row_1 = out_row_0 + outw;
+            float *out_row_2 = out_row_1 + outw;
+            float *out_row_3 = out_row_2 + outw;
+
+            int j = 0;
+#if defined(MACE_ENABLE_NEON)
+            for (; j + 3 < w; j += 4) {
+            float32x4_t in_vec = vld1q_f32(in);
+
+            // row 0
+            float32x4x2_t out0 = vld2q_f32(out_row_0);
+            out0.val[0] =
+              neon_vfma_lane_0(out0.val[0], in_vec, k0_vec);
+            out0.val[1] =
+              neon_vfma_lane_1(out0.val[1], in_vec, k0_vec);
+            vst2q_f32(out_row_0, out0);
+            out0 = vld2q_f32(out_row_0 + 2);
+            out0.val[0] =
+              neon_vfma_lane_2(out0.val[0], in_vec, k0_vec);
+            out0.val[1] =
+              neon_vfma_lane_3(out0.val[1], in_vec, k0_vec);
+            vst2q_f32(out_row_0 + 2, out0);
+
+            // row 1
+            float32x4x2_t out1 = vld2q_f32(out_row_1);
+            out1.val[0] =
+              neon_vfma_lane_0(out1.val[0], in_vec, k1_vec);
+            out1.val[1] =
+              neon_vfma_lane_1(out1.val[1], in_vec, k1_vec);
+            vst2q_f32(out_row_1, out1);
+            out1 = vld2q_f32(out_row_1 + 2);
+            out1.val[0] =
+              neon_vfma_lane_2(out1.val[0], in_vec, k1_vec);
+            out1.val[1] =
+              neon_vfma_lane_3(out1.val[1], in_vec, k1_vec);
+            vst2q_f32(out_row_1 + 2, out1);
+
+            // row 2
+            float32x4x2_t out2 = vld2q_f32(out_row_2);
+            out2.val[0] =
+              neon_vfma_lane_0(out2.val[0], in_vec, k2_vec);
+            out2.val[1] =
+              neon_vfma_lane_1(out2.val[1], in_vec, k2_vec);
+            vst2q_f32(out_row_2, out2);
+            out2 = vld2q_f32(out_row_2 + 2);
+            out2.val[0] =
+              neon_vfma_lane_2(out2.val[0], in_vec, k2_vec);
+            out2.val[1] =
+              neon_vfma_lane_3(out2.val[1], in_vec, k2_vec);
+            vst2q_f32(out_row_2 + 2, out2);
+
+            // row 3
+            float32x4x2_t out3 = vld2q_f32(out_row_3);
+            out3.val[0] =
+              neon_vfma_lane_0(out3.val[0], in_vec, k3_vec);
+            out3.val[1] =
+              neon_vfma_lane_1(out3.val[1], in_vec, k3_vec);
+            vst2q_f32(out_row_3, out3);
+            out3 = vld2q_f32(out_row_3 + 2);
+            out3.val[0] =
+              neon_vfma_lane_2(out3.val[0], in_vec, k3_vec);
+            out3.val[1] =
+              neon_vfma_lane_3(out3.val[1], in_vec, k3_vec);
+            vst2q_f32(out_row_3 + 2, out3);
+
+            in += 4;
+            out_row_0 += 8;
+            out_row_1 += 8;
+            out_row_2 += 8;
+            out_row_3 += 8;
+          }
+#endif
+            for (; j < w; j++) {
+              float val = in[0];
+              for (int k = 0; k < 4; ++k) {
+                out_row_0[k] += val * k0[k];
+                out_row_1[k] += val * k1[k];
+                out_row_2[k] += val * k2[k];
+                out_row_3[k] += val * k3[k];
+              }
+              in++;
+              out_row_0 += 2;
+              out_row_1 += 2;
+              out_row_2 += 2;
+              out_row_3 += 2;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc
index 0bfa8200..001b38a9 100644
--- a/mace/ops/deconv_2d.cc
+++ b/mace/ops/deconv_2d.cc
@@ -359,12 +359,12 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
                 padded_out_shape.data(),
                 out_data);
     if (!no_pad) {
-      CropPadOut(out_data,
-                 padded_out_shape.data(),
-                 output_shape.data(),
-                 pad_h,
-                 pad_w,
-                 output_data);
+      CropPadOut<float>(out_data,
+                        padded_out_shape.data(),
+                        output_shape.data(),
+                        pad_h,
+                        pad_w,
+                        output_data);
     }
 
     if (bias_data != nullptr) {
@@ -445,33 +445,6 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
       }
     }
   }
-
-  void CropPadOut(const float *input,
-                  const index_t *in_shape,
-                  const index_t *out_shape,
-                  const index_t pad_h,
-                  const index_t pad_w,
-                  float *output) {
-    const index_t batch = in_shape[0];
-    const index_t channel = in_shape[1];
-    const index_t in_height = in_shape[2];
-    const index_t in_width = in_shape[3];
-
-    const index_t out_height = out_shape[2];
-    const index_t out_width = out_shape[3];
-#pragma omp parallel for collapse(3)
-    for (int i = 0; i < batch; ++i) {
-      for (int j = 0; j < channel; ++j) {
-        for (int k = 0; k < out_height; ++k) {
-          const float *input_base =
-              input + ((i * channel + j) * in_height + (k + pad_h)) * in_width;
-          float *output_base =
-              output + ((i * channel + j) * out_height + k)* out_width;
-          memcpy(output_base, input_base + pad_w, out_width * sizeof(float));
-        }
-      }
-    }
-  }
 };
 
 #ifdef MACE_ENABLE_OPENCL
diff --git a/mace/ops/deconv_2d.h b/mace/ops/deconv_2d.h
index 35dcee8b..1af7362b 100644
--- a/mace/ops/deconv_2d.h
+++ b/mace/ops/deconv_2d.h
@@ -15,6 +15,8 @@
 #ifndef MACE_OPS_DECONV_2D_H_
 #define MACE_OPS_DECONV_2D_H_
 
+#include "mace/core/types.h"
+
 namespace mace {
 namespace ops {
 
@@ -23,6 +25,34 @@ enum FrameworkType {
   CAFFE = 1,
 };
 
+template <typename T>
+void CropPadOut(const T *input,
+                const index_t *in_shape,
+                const index_t *out_shape,
+                const index_t pad_h,
+                const index_t pad_w,
+                T *output) {
+  const index_t batch = in_shape[0];
+  const index_t channel = in_shape[1];
+  const index_t in_height = in_shape[2];
+  const index_t in_width = in_shape[3];
+
+  const index_t out_height = out_shape[2];
+  const index_t out_width = out_shape[3];
+#pragma omp parallel for collapse(3)
+  for (int i = 0; i < batch; ++i) {
+    for (int j = 0; j < channel; ++j) {
+      for (int k = 0; k < out_height; ++k) {
+        const T *input_base =
+            input + ((i * channel + j) * in_height + (k + pad_h)) * in_width;
+        T *output_base =
+            output + ((i * channel + j) * out_height + k)* out_width;
+        memcpy(output_base, input_base + pad_w, out_width * sizeof(T));
+      }
+    }
+  }
+}
+
 }  // namespace ops
 }  // namespace mace
 
diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc
new file mode 100644
index 00000000..a4b2ba1c
--- /dev/null
+++ b/mace/ops/depthwise_deconv2d.cc
@@ -0,0 +1,547 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/deconv_2d.h"
+
+#if defined(MACE_ENABLE_NEON)
+#include <arm_neon.h>
+#endif
+
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "mace/core/future.h"
+#include "mace/core/operator.h"
+#include "mace/core/tensor.h"
+#include "mace/ops/activation.h"
+#include "mace/ops/arm/depthwise_deconv2d_neon.h"
+#include "mace/ops/conv_pool_2d_util.h"
+#include "mace/utils/utils.h"
+#include "mace/public/mace.h"
+#ifdef MACE_ENABLE_OPENCL
+#include "mace/ops/opencl/image/depthwise_deconv2d.h"
+#endif  // MACE_ENABLE_OPENCL
+
+namespace mace {
+namespace ops {
+
+class DepthwiseDeconv2dOpBase : public Operation {
+ public:
+  explicit DepthwiseDeconv2dOpBase(OpConstructContext *context)
+      : Operation(context),
+        strides_(Operation::GetRepeatedArgs<int>("strides")),
+        paddings_(Operation::GetRepeatedArgs<int>("padding_values")),
+        group_(Operation::GetOptionalArg<int>("group", 1)),
+        activation_(ops::StringToActivationType(
+            Operation::GetOptionalArg<std::string>("activation",
+                                                      "NOOP"))),
+        relux_max_limit_(Operation::GetOptionalArg<float>("max_limit",
+                                                             0.0f)) {}
+
+  static void CalcGroupDeconvOutputSize(
+      const index_t *input_shape,   // NHWC
+      const index_t *filter_shape,  // OIHW
+      const int group,
+      const int *strides,
+      const int *paddings,
+      int *pre_paddings,
+      index_t *out_shape,
+      index_t *padded_out_shape,
+      const bool isNCHW = false) {
+    MACE_CHECK_NOTNULL(paddings);
+    MACE_CHECK_NOTNULL(input_shape);
+    MACE_CHECK_NOTNULL(filter_shape);
+    MACE_CHECK_NOTNULL(strides);
+
+    const index_t in_height = isNCHW ? input_shape[2] : input_shape[1];
+    const index_t in_width = isNCHW ? input_shape[3] : input_shape[2];
+
+    const index_t output_channel = filter_shape[0] * group;
+
+    const index_t kernel_h = filter_shape[2];
+    const index_t kernel_w = filter_shape[3];
+
+    index_t padded_out_height =
+        (in_height - 1) * strides[0] + kernel_h;
+    index_t padded_out_width =
+        (in_width - 1) * strides[1] + kernel_w;
+
+    if (pre_paddings != nullptr) {
+      pre_paddings[0] = static_cast<int>((kernel_h - 1) * 2 - paddings[0]);
+      pre_paddings[1] = static_cast<int>((kernel_w - 1) * 2 - paddings[1]);
+      pre_paddings[0] = std::max<int>(0, pre_paddings[0]);
+      pre_paddings[1] = std::max<int>(0, pre_paddings[1]);
+    }
+
+    if (padded_out_shape != nullptr) {
+      padded_out_shape[0] = input_shape[0];
+      padded_out_shape[1] = isNCHW ? output_channel : padded_out_height;
+      padded_out_shape[2] = isNCHW ? padded_out_height : padded_out_width;
+      padded_out_shape[3] = isNCHW ? padded_out_width : output_channel;
+    }
+
+    if (out_shape != nullptr) {
+      index_t out_height = padded_out_height - paddings[0];
+      index_t out_width = padded_out_width - paddings[1];
+      out_shape[0] = input_shape[0];
+      out_shape[1] = isNCHW ? output_channel : out_height;
+      out_shape[2] = isNCHW ? out_height : out_width;
+      out_shape[3] = isNCHW ? out_width : output_channel;
+    }
+  }
+
+ protected:
+  std::vector<int> strides_;  // [stride_h, stride_w]
+  std::vector<int> paddings_;
+  const int group_;
+  const ActivationType activation_;
+  const float relux_max_limit_;
+};
+
+
+
+template <DeviceType D, class T>
+class DepthwiseDeconv2dOp;
+
+template<>
+class DepthwiseDeconv2dOp<DeviceType::CPU, float>
+    : public DepthwiseDeconv2dOpBase {
+ public:
+  explicit DepthwiseDeconv2dOp(OpConstructContext *context)
+      : DepthwiseDeconv2dOpBase(context) {}
+
+  MaceStatus Run(OpContext *context) override {
+    const Tensor *input = this->Input(0);
+    const Tensor *filter = this->Input(1);
+    const Tensor *bias = this->InputSize() >= 3 ? this->Input(2) : nullptr;
+    Tensor *output = this->Output(0);
+
+    MACE_CHECK_NOTNULL(input);
+    MACE_CHECK_NOTNULL(filter);
+    MACE_CHECK_NOTNULL(output);
+
+    std::vector<int> out_paddings(2, 0);
+    std::vector<index_t> out_shape(4, 0);
+    std::vector<index_t> padded_out_shape(4, 0);
+
+    if (!paddings_.empty()) out_paddings = paddings_;
+    CalcGroupDeconvOutputSize(input->shape().data(),
+                              filter->shape().data(),
+                              group_,
+                              strides_.data(),
+                              out_paddings.data(),
+                              nullptr,
+                              out_shape.data(),
+                              padded_out_shape.data(),
+                              true);
+    MACE_RETURN_IF_ERROR(output->Resize(out_shape));
+    output->Clear();
+    index_t kernel_h = filter->dim(2);
+    index_t kernel_w = filter->dim(3);
+
+    Tensor::MappingGuard input_mapper(input);
+    Tensor::MappingGuard filter_mapper(filter);
+    Tensor::MappingGuard bias_mapper(bias);
+    Tensor::MappingGuard output_mapper(output);
+    auto input_data = input->data<float>();
+    auto filter_data = filter->data<float>();
+    auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
+
+    auto output_data = output->mutable_data<float>();
+
+    const index_t pad_left = out_paddings[0] / 2;
+    const index_t pad_top = out_paddings[1] / 2;
+
+    index_t padded_out_size =
+        std::accumulate(padded_out_shape.begin(),
+                        padded_out_shape.end(),
+                        1,
+                        std::multiplies<index_t>()) * sizeof(float);
+    ScratchBuffer *scratch = context->device()->scratch_buffer();
+    scratch->Rewind();
+    scratch->GrowSize(padded_out_size);
+    Tensor padded_out(scratch->Scratch(padded_out_size), DT_FLOAT);
+    padded_out.Reshape(padded_out_shape);
+    padded_out.Clear();
+    auto *padded_out_data = padded_out.mutable_data<float>();
+
+    const index_t in_channels = input->dim(1);
+    const index_t out_channels = output->dim(1);
+
+    bool no_pad = paddings_[0] == 0 && paddings_[1] == 0;
+    float *out_data = no_pad ? output_data : padded_out_data;
+
+    bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 &&
+        strides_[0] == strides_[1] && strides_[0] == 1;
+    bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 &&
+        strides_[0] == strides_[1] && strides_[0] == 2;
+    bool use_neon_4x4_s1 = kernel_h == kernel_w && kernel_h == 4 &&
+        strides_[0] == strides_[1] && strides_[0] == 1;
+    bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 &&
+        strides_[0] == strides_[1] && strides_[0] == 2;
+
+    bool is_depthwise = (group_ == in_channels && group_ == out_channels);
+
+    std::function<void(const float *input,
+                       const float *filter,
+                       const int group,
+                       const index_t *in_shape,
+                       const index_t *out_shape,
+                       float *output)> kernel_func;
+
+    if (use_neon_3x3_s1) {
+      kernel_func = [=](const float *input,
+                        const float *filter,
+                        const int group,
+                        const index_t *in_shape,
+                        const index_t *padded_out_shape,
+                        float *padded_output) {
+        if (is_depthwise) {
+          DepthwiseDeconv2dNeonK3x3S1(input,
+                                      filter,
+                                      in_shape,
+                                      padded_out_shape,
+                                      padded_output);
+        } else {
+          GroupDeconv2dNeonK3x3S1(input,
+                                  filter,
+                                  group,
+                                  in_shape,
+                                  padded_out_shape,
+                                  padded_output);
+        }
+      };
+    } else if (use_neon_3x3_s2) {
+      kernel_func = [=](const float *input,
+                        const float *filter,
+                        const int group,
+                        const index_t *in_shape,
+                        const index_t *padded_out_shape,
+                        float *padded_output) {
+        if (is_depthwise) {
+          DepthwiseDeconv2dNeonK3x3S2(input,
+                                      filter,
+                                      in_shape,
+                                      padded_out_shape,
+                                      padded_output);
+        } else {
+          GroupDeconv2dNeonK3x3S2(input,
+                                  filter,
+                                  group,
+                                  in_shape,
+                                  padded_out_shape,
+                                  padded_output);
+        }
+      };
+    } else if (use_neon_4x4_s1) {
+      kernel_func = [=](const float *input,
+                        const float *filter,
+                        const int group,
+                        const index_t *in_shape,
+                        const index_t *padded_out_shape,
+                        float *padded_output) {
+        if (is_depthwise) {
+          DepthwiseDeconv2dNeonK4x4S1(input,
+                                      filter,
+                                      in_shape,
+                                      padded_out_shape,
+                                      padded_output);
+        } else {
+          GroupDeconv2dNeonK4x4S1(input,
+                                  filter,
+                                  group,
+                                  in_shape,
+                                  padded_out_shape,
+                                  padded_output);
+        }
+      };
+    } else if (use_neon_4x4_s2) {
+      kernel_func = [=](const float *input,
+                        const float *filter,
+                        const int group,
+                        const index_t *in_shape,
+                        const index_t *padded_out_shape,
+                        float *padded_output) {
+        if (is_depthwise) {
+          DepthwiseDeconv2dNeonK4x4S2(input,
+                                      filter,
+                                      in_shape,
+                                      padded_out_shape,
+                                      padded_output);
+        } else {
+          GroupDeconv2dNeonK4x4S2(input,
+                                  filter,
+                                  group,
+                                  in_shape,
+                                  padded_out_shape,
+                                  padded_output);
+        }
+      };
+    } else {
+      kernel_func = [=](const float *input,
+                        const float *filter,
+                        const int group,
+                        const index_t *in_shape,
+                        const index_t *padded_out_shape,
+                        float *padded_output) {
+        if (is_depthwise) {
+          DepthwiseDeconv2dGeneral(input,
+                                   filter,
+                                   kernel_h,
+                                   kernel_w,
+                                   strides_.data(),
+                                   in_shape,
+                                   padded_out_shape,
+                                   padded_output);
+        } else {
+          GroupDeconv2dGeneral(input,
+                               filter,
+                               kernel_h,
+                               kernel_w,
+                               strides_.data(),
+                               group,
+                               in_shape,
+                               padded_out_shape,
+                               padded_output);
+        }
+      };
+    }
+
+    kernel_func(input_data,
+                filter_data,
+                group_,
+                input->shape().data(),
+                padded_out_shape.data(),
+                out_data);
+
+
+    if (!no_pad) {
+      CropPadOut<float>(out_data,
+                        padded_out_shape.data(),
+                        out_shape.data(),
+                        pad_left,
+                        pad_top,
+                        output_data);
+    }
+
+    if (bias_data != nullptr) {
+      const index_t batch = out_shape[0];
+      const index_t channels = out_shape[1];
+      const index_t img_size = out_shape[2] * out_shape[3];
+#pragma omp parallel for collapse(3)
+      for (index_t b = 0; b < batch; ++b) {
+        for (index_t c = 0; c < channels; ++c) {
+          for (index_t i = 0; i < img_size; ++i) {
+            output_data[(b * channels + c) * img_size + i] +=
+                bias_data[c];
+          }
+        }
+      }
+    }
+
+    DoActivation<float>(output_data,
+                        output_data,
+                        output->size(),
+                        activation_,
+                        relux_max_limit_);
+
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  void DepthwiseDeconv2dGeneral(const float *input,
+                                const float *filter,
+                                const index_t kernel_h,
+                                const index_t kernel_w,
+                                const int *strides,
+                                const index_t *in_shape,
+                                const index_t *out_shape,
+                                float *output) {
+    const index_t batch = in_shape[0];
+    const index_t out_height = out_shape[2];
+    const index_t out_width = out_shape[3];
+
+    const index_t channels = in_shape[1];
+    const index_t in_height = in_shape[2];
+    const index_t in_width = in_shape[3];
+
+    const index_t out_img_size = out_height * out_width;
+    const index_t in_img_size = in_height * in_width;
+
+    const int kernel_size = kernel_h * kernel_w;
+    std::vector<int> index_map(kernel_size, 0);
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        index_map[i * kernel_w + j] = i * out_width + j;
+      }
+    }
+
+#pragma omp parallel for collapse(2)
+    for (int b = 0; b < batch; ++b) {
+      for (int c = 0; c < channels; ++c) {
+        float *out_base =
+            output + (b * channels + c) * out_img_size;
+        for (int i = 0; i < in_height; ++i) {
+          for (int j = 0; j < in_width; ++j) {
+            const index_t out_offset =
+                i * strides[0] * out_width + j * strides[1];
+            const index_t input_idx =
+                (b * channels + c) * in_img_size + i * in_width + j;
+            const float val = input[input_idx];
+            const index_t kernel_offset = c * kernel_size;
+            for (int k = 0; k < kernel_size; ++k) {
+              const index_t out_idx = out_offset + index_map[k];
+              const index_t kernel_idx = kernel_offset + k;
+              out_base[out_idx] += val * filter[kernel_idx];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void GroupDeconv2dGeneral(const float *input,
+                            const float *filter,
+                            const index_t kernel_h,
+                            const index_t kernel_w,
+                            const int *strides,
+                            const int group,
+                            const index_t *in_shape,
+                            const index_t *out_shape,
+                            float *output) {
+    const index_t out_channels = out_shape[1];
+    const index_t out_height = out_shape[2];
+    const index_t out_width = out_shape[3];
+
+    const index_t in_channels = in_shape[1];
+    const index_t in_height = in_shape[2];
+    const index_t in_width = in_shape[3];
+
+    MACE_CHECK(in_channels % group == 0 && out_channels % group == 0,
+               "invalid input/output channel and group.");
+
+    const index_t out_img_size = out_height * out_width;
+    const index_t in_img_size = in_height * in_width;
+
+    const int kernel_size = kernel_h * kernel_w;
+    std::vector<int> index_map(kernel_size, 0);
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        index_map[i * kernel_w + j] = i * out_width + j;
+      }
+    }
+
+    const int in_channels_g = in_channels / group;
+    const int out_channels_g = out_channels / group;
+#pragma omp parallel for collapse(3)
+    for (int b = 0; b < in_shape[0]; ++b) {
+      for (int g = 0; g < group; ++g) {
+        for (int p = 0; p < out_channels_g; ++p) {
+          const index_t out_base =
+              ((b * group + g) * out_channels_g + p) * out_img_size;
+          for (int i = 0; i < in_height; ++i) {
+            for (int j = 0; j < in_width; ++j) {
+              const index_t out_offset =
+                  i * strides[0] * out_width + j * strides[1];
+              for (int q = 0; q < in_channels_g; ++q) {
+                const  index_t in_base =
+                    ((b * group + g) * in_channels_g + q) * in_img_size;
+                const index_t in_offset =
+                    in_base + i * in_width + j;
+                const float val = input[in_offset];
+                const index_t k_offset =
+                    ((p * group + g) * in_channels_g + q) * kernel_size;
+                for (int k = 0; k < kernel_size; ++k) {
+                  const index_t out_idx = out_base + out_offset + index_map[k];
+                  const float w = filter[k_offset + k];
+                  output[out_idx] += val * w;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+#ifdef MACE_ENABLE_OPENCL
+template <typename T>
+class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public DepthwiseDeconv2dOpBase {
+ public:
+  explicit DepthwiseDeconv2dOp(OpConstructContext *context)
+      : DepthwiseDeconv2dOpBase(context) {
+    if (context->device()->opencl_runtime()->UseImageMemory()) {
+      kernel_.reset(new opencl::image::DepthwiseDeconv2dKernel<T>);
+    } else {
+      MACE_NOT_IMPLEMENTED;
+    }
+  }
+
+  MaceStatus Run(OpContext *context) override {
+    const Tensor *input = this->Input(0);
+    const Tensor *filter = this->Input(1);
+    const Tensor *bias = this->InputSize() >= 3 ? this->Input(2) : nullptr;
+    Tensor *output = this->Output(0);
+    MACE_CHECK_NOTNULL(input);
+    MACE_CHECK_NOTNULL(filter);
+    MACE_CHECK_NOTNULL(output);
+
+    std::vector<int> in_paddings(2, 0);
+    std::vector<index_t> out_shape(4, 0);
+
+    CalcGroupDeconvOutputSize(input->shape().data(),
+                              filter->shape().data(),
+                              group_,
+                              strides_.data(),
+                              paddings_.data(),
+                              in_paddings.data(),
+                              out_shape.data(),
+                              nullptr);
+
+    return kernel_->Compute(context,
+                            input,
+                            filter,
+                            bias,
+                            strides_.data(),
+                            in_paddings.data(),
+                            group_,
+                            activation_,
+                            relux_max_limit_,
+                            out_shape,
+                            output);
+  }
+
+ private:
+  std::unique_ptr<OpenCLDepthwiseDeconv2dKernel> kernel_;
+};
+#endif  // MACE_ENABLE_OPENCL
+
+void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry) {
+  MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
+                   DepthwiseDeconv2dOp, DeviceType::CPU, float);
+
+#ifdef MACE_ENABLE_OPENCL
+  MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
+                   DepthwiseDeconv2dOp, DeviceType::GPU, float);
+
+  MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
+                   DepthwiseDeconv2dOp, DeviceType::GPU, half);
+#endif  // MACE_ENABLE_OPENCL
+}
+
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/depthwise_deconv2d_benchmark.cc b/mace/ops/depthwise_deconv2d_benchmark.cc
new file mode 100644
index 00000000..3e3da26f
--- /dev/null
+++ b/mace/ops/depthwise_deconv2d_benchmark.cc
@@ -0,0 +1,130 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+
+#include "mace/core/operator.h"
+#include "mace/core/testing/test_benchmark.h"
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+namespace ops {
+namespace test {
+
+template <DeviceType D, typename T>
+static void DepthwiseDeconv2d(int iters,
+                              int batch,
+                              int channels,
+                              int height,
+                              int width,
+                              int kernel_h,
+                              int kernel_w,
+                              int stride,
+                              int padding) {
+  mace::testing::StopTiming();
+
+  OpsTestNet net;
+
+  // Add input data
+  if (D == DeviceType::CPU) {
+    net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
+  } else {
+    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
+  }
+  net.AddRandomInput<D, float>("Filter",
+                               {1, channels, kernel_h,
+                                kernel_w});
+  if (D == DeviceType::GPU) {
+    BufferToImage<D, T>(&net, "Input", "InputImage",
+                        ops::BufferType::IN_OUT_CHANNEL);
+    BufferToImage<D, T>(&net, "Filter", "FilterImage",
+                        ops::BufferType::DW_CONV2D_FILTER);
+    OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Output("Output")
+        .AddIntsArg("strides", {stride, stride})
+        .AddIntsArg("padding_values", {padding, padding})
+        .AddIntArg("group", channels)
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+  } else {
+    OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
+        .Input("Input")
+        .Input("Filter")
+        .Output("Output")
+        .AddIntsArg("strides", {stride, stride})
+        .AddIntsArg("padding_values", {padding, padding})
+        .AddIntArg("group", channels)
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
+        .Finalize(net.NewOperatorDef());
+  }
+
+  net.Setup(D);
+
+  // Warm-up
+  for (int i = 0; i < 2; ++i) {
+    net.Run();
+    net.Sync();
+  }
+
+  mace::testing::StartTiming();
+  while (iters--) {
+    net.Run();
+    net.Sync();
+  }
+}
+
+// In common network, there are usually more than 1 layers, this is used to
+// approximate the amortized latency. The OpenCL runtime for Mali/Adreno is
+// in-order.
+
+#define MACE_BM_DEPTHWISE_DECONV2D_MACRO(                    \
+    N, C, H, W, KH, KW, S, P, TYPE, DEVICE)                  \
+  static void                                                                 \
+    MACE_BM_DEPTHWISE_DECONV2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##S##_##P\
+        ##_##TYPE##_##DEVICE(                                                 \
+          int iters) {                                                        \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
+    const int64_t macc =                                                      \
+        static_cast<int64_t>(iters) * N * H * W * KH * KW * C;   \
+    mace::testing::MaccProcessed(macc);                                       \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
+    DepthwiseDeconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, S, P);        \
+  }                                                                           \
+  MACE_BENCHMARK(                                                             \
+    MACE_BM_DEPTHWISE_DECONV2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##S##_##P\
+        ##_##TYPE##_##DEVICE)
+
+#define MACE_BM_DEPTHWISE_DECONV2D(N, C, H, W, KH, KW, S, P)              \
+  MACE_BM_DEPTHWISE_DECONV2D_MACRO(N, C, H, W, KH, KW, S, P, float, CPU); \
+  MACE_BM_DEPTHWISE_DECONV2D_MACRO(N, C, H, W, KH, KW, S, P, float, GPU); \
+  MACE_BM_DEPTHWISE_DECONV2D_MACRO(N, C, H, W, KH, KW, S, P, half, GPU);
+
+MACE_BM_DEPTHWISE_DECONV2D(1, 128, 15, 15, 1, 1, 1, 0);
+MACE_BM_DEPTHWISE_DECONV2D(1, 32, 60, 60, 1, 1, 1, 0);
+
+MACE_BM_DEPTHWISE_DECONV2D(1, 32, 60, 60, 3, 3, 1, 0);
+
+MACE_BM_DEPTHWISE_DECONV2D(1, 128, 60, 60, 4, 4, 1, 0);
+MACE_BM_DEPTHWISE_DECONV2D(1, 3, 224, 224, 4, 4, 2, 0);
+MACE_BM_DEPTHWISE_DECONV2D(1, 3, 512, 512, 7, 7, 2, 0);
+MACE_BM_DEPTHWISE_DECONV2D(1, 128, 16, 16, 5, 5, 1, 0);
+
+MACE_BM_DEPTHWISE_DECONV2D(1, 64, 32, 32, 1, 1, 1, 0);
+MACE_BM_DEPTHWISE_DECONV2D(1, 64, 33, 32, 3, 3, 2, 0);
+MACE_BM_DEPTHWISE_DECONV2D(1, 3, 224, 224, 3, 3, 2, 0);
+}  // namespace test
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/depthwise_deconv2d_test.cc b/mace/ops/depthwise_deconv2d_test.cc
new file mode 100644
index 00000000..b1f36845
--- /dev/null
+++ b/mace/ops/depthwise_deconv2d_test.cc
@@ -0,0 +1,286 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fstream>
+#include <vector>
+
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+namespace ops {
+namespace test {
+
+class DepthwiseDeconv2dOpTest : public OpsTestBase {};
+
+namespace {
+template <DeviceType D>
+void RunTestSimple(const int group,
+                   const std::vector<index_t> &input_shape,
+                   const std::vector<float> &input_data,
+                   const std::vector<float> &bias_data,
+                   const int stride,
+                   const std::vector<int> &paddings,
+                   const std::vector<index_t> &filter_shape,
+                   const std::vector<float> &filter_data,
+                   const std::vector<index_t> &expected_shape,
+                   const std::vector<float> &expected_data) {
+  OpsTestNet net;
+  // Add input data
+  net.AddInputFromArray<D, float>("Input", input_shape, input_data);
+  net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data);
+  net.TransformDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
+  const index_t out_channels = expected_shape[3];
+  net.AddInputFromArray<D, float>("Bias", {out_channels}, bias_data);
+
+  if (D == DeviceType::GPU) {
+    BufferToImage<D, float>(&net, "Input", "InputImage",
+                            ops::BufferType::IN_OUT_CHANNEL);
+    BufferToImage<D, float>(&net, "FilterOIHW", "FilterImage",
+                            ops::BufferType::DW_CONV2D_FILTER);
+    BufferToImage<D, float>(&net, "Bias", "BiasImage",
+                            ops::BufferType::ARGUMENT);
+    OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {stride, stride})
+        .AddIntArg("group", group)
+        .AddIntsArg("padding_values", paddings)
+        .Finalize(net.NewOperatorDef());
+
+    net.RunOp(D);
+
+    // Transfer output
+    ImageToBuffer<D, float>(&net, "OutputImage", "Output",
+                            ops::BufferType::IN_OUT_CHANNEL);
+  } else {
+    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC,
+                                                    "InputNCHW", NCHW);
+    OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
+        .Input("InputNCHW")
+        .Input("FilterOIHW")
+        .Input("Bias")
+        .Output("OutputNCHW")
+        .AddIntArg("group", group)
+        .AddIntsArg("strides", {stride, stride})
+        .AddIntsArg("padding_values", paddings)
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+                                                    "Output", NHWC);
+  }
+
+  auto expected = net.CreateTensor<float>(expected_shape, expected_data);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.0001);
+}
+
+template <DeviceType D>
+void TestNHWCSimple3x3_DW() {
+  RunTestSimple<D>(3,
+                   {1, 3, 3, 3},
+                   {1, 1, 1, 1, 1, 1, 1, 1, 1,
+                    1, 1, 1, 1, 1, 1, 1, 1, 1,
+                    1, 1, 1, 1, 1, 1, 1, 1, 1},
+                   {0, 0, 0},
+                   1, {0, 0},
+                   {3, 3, 1, 3},
+                   {1, 1, 1, 1, 1, 1, 1, 1, 1,
+                    1, 1, 1, 1, 1, 1, 1, 1, 1,
+                    1, 1, 1, 1, 1, 1, 1, 1, 1},
+                   {1, 5, 5, 3},
+                   {1, 1, 1, 2, 2, 2, 3, 3, 3, 2, 2, 2, 1, 1, 1,
+                    2, 2, 2, 4, 4, 4, 6, 6, 6, 4, 4, 4, 2, 2, 2,
+                    3, 3, 3, 6, 6, 6, 9, 9, 9, 6, 6, 6, 3, 3, 3,
+                    2, 2, 2, 4, 4, 4, 6, 6, 6, 4, 4, 4, 2, 2, 2,
+                    1, 1, 1, 2, 2, 2, 3, 3, 3, 2, 2, 2, 1, 1, 1});
+}
+
+template <DeviceType D>
+void TestNHWCSimple3x3_Group() {
+  RunTestSimple<D>(2,
+                   {1, 3, 3, 4},
+                   {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+                    1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+                    1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4},
+                   {0, 0, 0, 0, 0, 0},
+                   1, {0, 0},
+                   {3, 3, 3, 4},
+                   {1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1,
+                    1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1,
+                    1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1,
+                    1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1,
+                    1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1,
+                    1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1,
+                    1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1,
+                    1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1,
+                    1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1},
+                   {1, 5, 5, 6},
+                   {3, 6, 3, 7, 14, 7,
+                    6, 12, 6, 14, 28, 14,
+                    9, 18, 9, 21, 42, 21,
+                    6, 12, 6, 14, 28, 14,
+                    3, 6, 3, 7, 14, 7,
+                    6, 12, 6, 14, 28, 14,
+                    12, 24, 12, 28, 56, 28,
+                    18, 36, 18, 42, 84, 42,
+                    12, 24, 12, 28, 56, 28,
+                    6, 12, 6, 14, 28, 14,
+                    9, 18, 9, 21, 42, 21,
+                    18, 36, 18, 42, 84, 42,
+                    27, 54, 27, 63, 126, 63,
+                    18, 36, 18, 42, 84, 42,
+                    9, 18, 9, 21, 42, 21,
+                    6, 12, 6, 14, 28, 14,
+                    12, 24, 12, 28, 56, 28,
+                    18, 36, 18, 42, 84, 42,
+                    12, 24, 12, 28, 56, 28,
+                    6, 12, 6, 14, 28, 14,
+                    3, 6, 3, 7, 14, 7,
+                    6, 12, 6, 14, 28, 14,
+                    9, 18, 9, 21, 42, 21,
+                    6, 12, 6, 14, 28, 14,
+                    3, 6, 3, 7, 14, 7});
+}
+}  // namespace
+
+TEST_F(DepthwiseDeconv2dOpTest, CPUSimple3X3Depthwise) {
+  TestNHWCSimple3x3_DW<DeviceType::CPU>();
+}
+
+TEST_F(DepthwiseDeconv2dOpTest, CPUSimple3X3Group) {
+TestNHWCSimple3x3_Group<DeviceType::CPU>();
+}
+
+TEST_F(DepthwiseDeconv2dOpTest, GPUSimple3X3Depthwise) {
+TestNHWCSimple3x3_DW<DeviceType::GPU>();
+}
+
+namespace {
+template <typename T>
+void RandomTest(index_t batch,
+                      index_t channel,
+                      index_t height,
+                      index_t width,
+                      index_t kernel,
+                      int stride,
+                      int padding) {
+  testing::internal::LogToStderr();
+  // Construct graph
+  OpsTestNet net;
+  int multiplier = 1;
+
+  // Add input data
+  std::vector<float> input_data(batch * height * width * channel);
+  GenerateRandomRealTypeData({batch, height, width, channel}, &input_data);
+  net.AddInputFromArray<DeviceType::GPU, float>("Input",
+                                                {batch,
+                                                 height,
+                                                 width,
+                                                 channel},
+                                                input_data);
+  std::vector<float> filter_data(kernel * kernel * channel * multiplier);
+  GenerateRandomRealTypeData({multiplier, channel, kernel, kernel},
+                             &filter_data);
+  net.AddInputFromArray<DeviceType::GPU, float>(
+      "Filter", {multiplier, channel, kernel, kernel}, filter_data);
+  std::vector<float> bias_data(channel * multiplier);
+  GenerateRandomRealTypeData({channel * multiplier}, &bias_data);
+  net.AddInputFromArray<DeviceType::GPU, float>("Bias",
+                                                {channel * multiplier},
+                                                bias_data);
+
+  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+                                                  NCHW);
+  OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
+      .Input("InputNCHW")
+      .Input("Filter")
+      .Input("Bias")
+      .Output("OutputNCHW")
+      .AddIntsArg("strides", {stride, stride})
+      .AddIntsArg("padding_values", {padding, padding})
+      .AddIntArg("group", channel)
+      .AddIntsArg("dilations", {1, 1})
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
+      .Finalize(net.NewOperatorDef());
+  // Run
+  net.RunOp(DeviceType::CPU);
+  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+                                                  "Output", NHWC);
+
+
+  // Check
+  auto expected = net.CreateTensor<float>();
+  expected->Copy(*net.GetOutput("Output"));
+
+
+  BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
+                      ops::BufferType::IN_OUT_CHANNEL);
+  BufferToImage<DeviceType::GPU, T>(&net, "Filter", "FilterImage",
+                      ops::BufferType::DW_CONV2D_FILTER);
+  BufferToImage<DeviceType::GPU, T>(&net, "Bias", "BiasImage",
+                      ops::BufferType::ARGUMENT);
+  OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
+      .Input("InputImage")
+      .Input("FilterImage")
+      .Input("BiasImage")
+      .Output("OutputImage")
+      .AddIntsArg("strides", {stride, stride})
+      .AddIntsArg("padding_values", {padding, padding})
+      .AddIntArg("group", channel)
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .Finalize(net.NewOperatorDef());
+
+  net.RunOp(DeviceType::GPU);
+
+  // Transfer output
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
+                      ops::BufferType::IN_OUT_CHANNEL);
+
+  if (DataTypeToEnum<T>::value == DT_FLOAT) {
+    ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
+  } else {
+    ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2);
+  }
+}
+
+TEST_F(DepthwiseDeconv2dOpTest, RandomTestFloat) {
+  RandomTest<float>(1, 32, 256, 256, 5, 1, 2);
+  RandomTest<float>(1, 3, 256, 256, 5, 1, 1);
+  RandomTest<float>(1, 3, 256, 256, 5, 2, 2);
+  RandomTest<float>(1, 3, 256, 256, 5, 1, 3);
+  RandomTest<float>(1, 3, 256, 256, 5, 2, 4);
+  RandomTest<float>(1, 4, 256, 256, 5, 1, 1);
+  RandomTest<float>(1, 4, 256, 256, 5, 2, 2);
+  RandomTest<float>(1, 4, 256, 256, 5, 1, 3);
+  RandomTest<float>(1, 4, 256, 256, 5, 2, 4);
+}
+//
+TEST_F(DepthwiseDeconv2dOpTest, RandomTestHalf) {
+  RandomTest<half>(1, 32, 256, 256, 5, 1, 2);
+  RandomTest<half>(1, 3, 256, 256, 5, 1, 1);
+  RandomTest<half>(1, 3, 256, 256, 5, 2, 2);
+  RandomTest<half>(1, 3, 256, 256, 5, 1, 3);
+  RandomTest<half>(1, 3, 256, 256, 5, 2, 4);
+  RandomTest<half>(1, 4, 256, 256, 5, 1, 1);
+  RandomTest<half>(1, 4, 256, 256, 5, 2, 2);
+  RandomTest<half>(1, 4, 256, 256, 5, 1, 3);
+  RandomTest<half>(1, 4, 256, 256, 5, 2, 4);
+}
+
+}  // namespace
+}  // namespace test
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/opencl/cl/deconv_2d.cl b/mace/ops/opencl/cl/deconv_2d.cl
index ae08b409..d39d3fe3 100644
--- a/mace/ops/opencl/cl/deconv_2d.cl
+++ b/mace/ops/opencl/cl/deconv_2d.cl
@@ -163,4 +163,5 @@ __kernel void deconv_2d(OUT_OF_RANGE_PARAMS
     out_pos.x += stride_w;
     WRITE_IMAGET(output, out_pos, out4);
   }
-}
\ No newline at end of file
+}
+
diff --git a/mace/ops/opencl/cl/depthwise_deconv2d.cl b/mace/ops/opencl/cl/depthwise_deconv2d.cl
new file mode 100644
index 00000000..9d648b65
--- /dev/null
+++ b/mace/ops/opencl/cl/depthwise_deconv2d.cl
@@ -0,0 +1,149 @@
+#include <common.h>
+
+__kernel void depthwise_deconv2d(OUT_OF_RANGE_PARAMS
+                                 GLOBAL_WORK_GROUP_SIZE_DIM3
+                                 __read_only image2d_t input,
+                                 __read_only image2d_t weights,
+#ifdef BIAS
+                                 __read_only image2d_t bias,
+#endif
+                                 __write_only image2d_t output,
+                                 __private const float relux_max_limit,
+                                 __private const int in_height,
+                                 __private const int in_width,
+                                 __private const int out_height,
+                                 __private const int out_width,
+                                 __private const int out_channel,
+                                 __private const int stride_h,
+                                 __private const int stride_w,
+                                 __private const float stride_h_r,
+                                 __private const float stride_w_r,
+                                 __private const int align_h,
+                                 __private const int align_w,
+                                 __private const int padding_h,
+                                 __private const int padding_w,
+                                 __private const int kernel_h,
+                                 __private const int kernel_w,
+                                 __private const int kernel_size,
+                                 __private const int out_channel_blocks)
+{
+  const int c = get_global_id(0);
+  const int w_id = get_global_id(1);
+  const int hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (c >= global_size_dim0 || w_id >= global_size_dim1
+      || hb >= global_size_dim2) {
+    return;
+  }
+#endif
+
+#ifdef BIAS
+  DATA_TYPE4 out0 =
+     READ_IMAGET(bias, SAMPLER, (int2)(c, 0));
+  DATA_TYPE4 out1 = out0;
+  DATA_TYPE4 out2 = out0;
+  DATA_TYPE4 out3 = out0;
+  DATA_TYPE4 out4 = out0;
+#else
+  DATA_TYPE4 out0 = 0;
+  DATA_TYPE4 out1 = 0;
+  DATA_TYPE4 out2 = 0;
+  DATA_TYPE4 out3 = 0;
+  DATA_TYPE4 out4 = 0;
+#endif
+  const int n_stride = mad(w_id, stride_w_r, 0);
+  const int mod_stride = w_id - mul24(n_stride, stride_w);
+  const int w = mad24(mul24(n_stride, 5), stride_w, mod_stride);
+  const int b = hb / out_height;
+  const int h = hb - mul24(b, out_height);
+  if (w < out_width) {
+
+    int start_x = floor((float) (w + align_w) * stride_w_r);
+    int start_y = (h + align_h) * stride_h_r;
+    start_y = max(0, start_y);
+
+    int f_start_x = mad24(start_x, stride_w, padding_w) - w;
+    int f_start_y = mad24(start_y, stride_h, padding_h) - h;
+    f_start_x = kernel_w - 1 - f_start_x;
+    f_start_y = kernel_h - 1 - f_start_y;
+
+    int2 in_pos;
+    int f_pos;
+    DATA_TYPE4 in0, in1, in2, in3, in4;
+    DATA_TYPE4 weight;
+    int idx_w0, idx_w1, idx_w2, idx_w3, idx_w4;
+    int index_x, index_y;
+    for (int f_y = f_start_y, idx_h = start_y ; f_y >= 0; f_y -= stride_h, ++idx_h) {
+      index_y = mad24(b, in_height, idx_h);
+      in_pos.y = select(index_y, -1, idx_h < 0 || idx_h >= in_height);
+      for (int f_x = f_start_x, idx_w = start_x; f_x >= 0; f_x -= stride_w, ++idx_w) {
+        idx_w0 = idx_w;
+        idx_w1 = idx_w + 1;
+        idx_w2 = idx_w + 2;
+        idx_w3 = idx_w + 3;
+        idx_w4 = idx_w + 4;
+
+#define READ_INPUT(i)                                                      \
+        index_x = mad24(c, in_width, idx_w##i);                            \
+        in_pos.x =                                                         \
+          select(index_x, -1, idx_w##i < 0 || idx_w##i >= in_width);       \
+        in##i = READ_IMAGET(input, SAMPLER, in_pos);
+
+        READ_INPUT(0);
+        READ_INPUT(1);
+        READ_INPUT(2);
+        READ_INPUT(3);
+        READ_INPUT(4);
+#undef READ_INPUT
+
+        f_pos = mad24(f_y, kernel_w, f_x);
+        weight = READ_IMAGET(weights, SAMPLER, (int2)(f_pos, c));
+        out0 = mad(in0, weight, out0);
+        out1 = mad(in1, weight, out1);
+        out2 = mad(in2, weight, out2);
+        out3 = mad(in3, weight, out3);
+        out4 = mad(in4, weight, out4);
+
+      }
+    }
+
+#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
+    out0 = do_activation(out0, relux_max_limit);
+    out1 = do_activation(out1, relux_max_limit);
+    out2 = do_activation(out2, relux_max_limit);
+    out3 = do_activation(out3, relux_max_limit);
+    out4 = do_activation(out4, relux_max_limit);
+#endif
+
+
+
+    int2 out_pos;
+    out_pos.y = hb;
+
+    int ow = w;
+    if (ow >= out_width) return;
+    out_pos.x = mad24(c, out_width, ow);
+    WRITE_IMAGET(output, out_pos, out0);
+
+    ow += stride_w;
+    if (ow >= out_width) return;
+    out_pos.x += stride_w;
+    WRITE_IMAGET(output, out_pos, out1);
+
+    ow += stride_w;
+    if (ow >= out_width) return;
+    out_pos.x += stride_w;
+    WRITE_IMAGET(output, out_pos, out2);
+
+    ow += stride_w;
+    if (ow >= out_width) return;
+    out_pos.x += stride_w;
+    WRITE_IMAGET(output, out_pos, out3);
+
+    ow += stride_w;
+    if (ow >= out_width) return;
+    out_pos.x += stride_w;
+    WRITE_IMAGET(output, out_pos, out4);
+  }
+}
\ No newline at end of file
diff --git a/mace/ops/opencl/depthwise_deconv2d.h b/mace/ops/opencl/depthwise_deconv2d.h
new file mode 100644
index 00000000..994c98a2
--- /dev/null
+++ b/mace/ops/opencl/depthwise_deconv2d.h
@@ -0,0 +1,49 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_OPENCL_DEPTHWISE_DECONV2D_H_
+#define MACE_OPS_OPENCL_DEPTHWISE_DECONV2D_H_
+
+#include <vector>
+
+#include "mace/ops/activation.h"
+
+namespace mace {
+
+class OpContext;
+class Tensor;
+
+namespace ops {
+
+class OpenCLDepthwiseDeconv2dKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *bias,
+      const int *strides,
+      const int *padding_data,
+      const int group,
+      const ActivationType activation,
+      const float relux_max_limit,
+      const std::vector <index_t> &output_shape,
+      Tensor *output) = 0;
+  MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLDepthwiseDeconv2dKernel);
+};
+
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_OPENCL_DEPTHWISE_DECONV2D_H_
diff --git a/mace/ops/opencl/image/deconv_2d.h b/mace/ops/opencl/image/deconv_2d.h
index c4dfa2bf..a0f51874 100644
--- a/mace/ops/opencl/image/deconv_2d.h
+++ b/mace/ops/opencl/image/deconv_2d.h
@@ -79,10 +79,10 @@ MaceStatus Deconv2dKernel<T>::Compute(
   const int stride_h = strides[0];
   const int stride_w = strides[1];
   MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
-#define MACE_WIDTH_BLK 5
+  const int width_tile = 5;
   const index_t n_strides = (width + stride_w - 1) / stride_w;
   const index_t width_blocks =
-      ((n_strides + MACE_WIDTH_BLK - 1) / MACE_WIDTH_BLK) * stride_w;
+      ((n_strides + width_tile - 1) / width_tile) * stride_w;
   const float stride_h_r = 1.f / static_cast<float>(stride_h);
   const float stride_w_r = 1.f / static_cast<float>(stride_w);
   const int padding_h = (padding_data[0] + 1) >> 1;
diff --git a/mace/ops/opencl/image/depthwise_deconv2d.h b/mace/ops/opencl/image/depthwise_deconv2d.h
new file mode 100644
index 00000000..040c349d
--- /dev/null
+++ b/mace/ops/opencl/image/depthwise_deconv2d.h
@@ -0,0 +1,196 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef MACE_OPS_OPENCL_IMAGE_DEPTHWISE_DECONV2D_H_
+#define MACE_OPS_OPENCL_IMAGE_DEPTHWISE_DECONV2D_H_
+
+#include "mace/ops/opencl/depthwise_deconv2d.h"
+
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "mace/core/op_context.h"
+#include "mace/core/tensor.h"
+#include "mace/ops/opencl/helper.h"
+
+namespace mace {
+namespace ops {
+namespace opencl {
+namespace image {
+
+template <typename T>
+class DepthwiseDeconv2dKernel : public OpenCLDepthwiseDeconv2dKernel {
+ public:
+  MaceStatus Compute(
+      OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *bias,
+      const int *strides,
+      const int *padding_data,
+      const int group,
+      const ActivationType activation,
+      const float relux_max_limit,
+      const std::vector<index_t> &output_shape,
+      Tensor *output) override;
+
+ private:
+  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  std::vector<index_t> input_shape_;
+};
+
+template <typename T>
+MaceStatus DepthwiseDeconv2dKernel<T>::Compute(
+    OpContext *context,
+    const Tensor *input,
+    const Tensor *filter,
+    const Tensor *bias,
+    const int *strides,
+    const int *padding_data,
+    const int group,
+    const ActivationType activation,
+    const float relux_max_limit,
+    const std::vector<index_t> &output_shape,
+    Tensor *output) {
+  const index_t batch = output_shape[0];
+  const index_t height = output_shape[1];
+  const index_t width = output_shape[2];
+  const index_t channels = output_shape[3];
+  const index_t input_channels = input->dim(3);
+  const index_t multiplier = filter->dim(0);
+
+  MACE_CHECK(group == channels && group == input_channels && multiplier == 1,
+             "opencl image deconv only supports depthwise type group.");
+
+  std::vector<size_t> output_image_shape;
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
+                  &output_image_shape);
+  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
+  const DataType dt = DataTypeToEnum<T>::value;
+
+  const index_t channel_blocks = RoundUpDiv4(channels);
+  const int stride_h = strides[0];
+  const int stride_w = strides[1];
+  MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
+  const int width_tile = 5;
+  const index_t n_strides = (width + stride_w - 1) / stride_w;
+  const index_t width_blocks =
+      ((n_strides + width_tile - 1) / width_tile) * stride_w;
+  const float stride_h_r = 1.f / static_cast<float>(stride_h);
+  const float stride_w_r = 1.f / static_cast<float>(stride_w);
+  const int padding_h = (padding_data[0] + 1) >> 1;
+  const int padding_w = (padding_data[1] + 1) >> 1;
+
+  const int align_h = stride_h - 1 - padding_h;
+  const int align_w = stride_w - 1 - padding_w;
+  const int kernel_size = filter->dim(2) * filter->dim(3);
+
+  auto runtime = context->device()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION;
+
+  if (kernel_.get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_deconv2d");
+    built_options.emplace("-Ddepthwise_deconv2d=" + kernel_name);
+    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
+    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+    switch (activation) {
+      case NOOP:
+        break;
+      case RELU:
+        built_options.emplace("-DUSE_RELU");
+        break;
+      case RELUX:
+        built_options.emplace("-DUSE_RELUX");
+        break;
+      case TANH:
+        built_options.emplace("-DUSE_TANH");
+        break;
+      case SIGMOID:
+        built_options.emplace("-DUSE_SIGMOID");
+        break;
+      default:
+        LOG(FATAL) << "Unknown activation type: " << activation;
+    }
+
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("depthwise_deconv2d", kernel_name,
+                                              built_options, &kernel_));
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
+  }
+
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width_blocks),
+                           static_cast<uint32_t>(height * batch)};
+
+  MACE_OUT_OF_RANGE_INIT(kernel_);
+  if (!IsVecEqual(input_shape_, input->shape())) {
+    uint32_t idx = 0;
+    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
+    MACE_SET_3D_GWS_ARGS(kernel_, gws);
+    kernel_.setArg(idx++, *(input->opencl_image()));
+    kernel_.setArg(idx++, *(filter->opencl_image()));
+    if (bias != nullptr) {
+      kernel_.setArg(idx++, *(bias->opencl_image()));
+    }
+    kernel_.setArg(idx++, *(output->opencl_image()));
+    kernel_.setArg(idx++, relux_max_limit);
+    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
+    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
+    kernel_.setArg(idx++, static_cast<int32_t>(height));
+    kernel_.setArg(idx++, static_cast<int32_t>(width));
+    kernel_.setArg(idx++, static_cast<int32_t>(channels));
+    kernel_.setArg(idx++, static_cast<int32_t>(stride_h));
+    kernel_.setArg(idx++, static_cast<int32_t>(stride_w));
+    kernel_.setArg(idx++, stride_h_r);
+    kernel_.setArg(idx++, stride_w_r);
+    kernel_.setArg(idx++, static_cast<int32_t>(align_h));
+    kernel_.setArg(idx++, static_cast<int32_t>(align_w));
+    kernel_.setArg(idx++, static_cast<int32_t>(padding_h));
+    kernel_.setArg(idx++, static_cast<int32_t>(padding_w));
+    kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(2)));
+    kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(3)));
+    kernel_.setArg(idx++, static_cast<int32_t>(kernel_size));
+    kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
+
+    input_shape_ = input->shape();
+  }
+
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
+  std::string tuning_key =
+      Concat("depthwise_deconv2d_kernel_",
+             activation,
+             output->dim(0),
+             output->dim(1),
+             output->dim(2),
+             output->dim(3));
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
+                                           gws, lws, context->future()));
+
+  MACE_OUT_OF_RANGE_VALIDATION;
+  return MaceStatus::MACE_SUCCESS;
+}
+
+}  // namespace image
+}  // namespace opencl
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_OPENCL_IMAGE_DEPTHWISE_DECONV2D_H_
diff --git a/mace/ops/ops_registry.cc b/mace/ops/ops_registry.cc
index e330d66a..bd3038c5 100644
--- a/mace/ops/ops_registry.cc
+++ b/mace/ops/ops_registry.cc
@@ -32,6 +32,7 @@ extern void RegisterCrop(OpRegistryBase *op_registry);
 extern void RegisterDeconv2D(OpRegistryBase *op_registry);
 extern void RegisterDepthToSpace(OpRegistryBase *op_registry);
 extern void RegisterDepthwiseConv2d(OpRegistryBase *op_registry);
+extern void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry);
 extern void RegisterDequantize(OpRegistryBase *op_registry);
 extern void RegisterEltwise(OpRegistryBase *op_registry);
 extern void RegisterExpandDims(OpRegistryBase *op_registry);
@@ -89,6 +90,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() {
   ops::RegisterDeconv2D(this);
   ops::RegisterDepthToSpace(this);
   ops::RegisterDepthwiseConv2d(this);
+  ops::RegisterDepthwiseDeconv2d(this);
   ops::RegisterDequantize(this);
   ops::RegisterEltwise(this);
   ops::RegisterExpandDims(this);
diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py
index e7dff9bd..fe030985 100644
--- a/mace/python/tools/converter_tool/base_converter.py
+++ b/mace/python/tools/converter_tool/base_converter.py
@@ -91,6 +91,7 @@ MaceSupportedOps = [
     'Deconv2D',
     'DepthToSpace',
     'DepthwiseConv2d',
+    'DepthwiseDeconv2d',
     'Dequantize',
     'Eltwise',
     'ExpandDims',
@@ -183,6 +184,7 @@ class MaceKeyword(object):
     mace_scalar_input_index_str = 'scalar_input_index'
     mace_opencl_mem_type = "opencl_mem_type"
     mace_framework_type_str = "framework_type"
+    mace_group_str = "group"
 
 
 class TransformerRule(Enum):
diff --git a/mace/python/tools/converter_tool/caffe_converter.py b/mace/python/tools/converter_tool/caffe_converter.py
index 374d1073..a2e89a6d 100644
--- a/mace/python/tools/converter_tool/caffe_converter.py
+++ b/mace/python/tools/converter_tool/caffe_converter.py
@@ -411,17 +411,14 @@ class CaffeConverter(base_converter.ConverterInterface):
     def convert_deconv2d(self, caffe_op):
         op = self.convert_general_op(caffe_op)
         param = caffe_op.layer.convolution_param
-        is_depthwise = False
-        if param.HasField(caffe_group_str) and param.group > 1:
-            filter_data = caffe_op.blobs[0]
-            mace_check(param.group == filter_data.shape[0] and
-                       filter_data.shape[1] == 1,
-                       "Mace does not support group deconvolution yet")
-            is_depthwise = True
-        mace_check(is_depthwise is False,
-                   "Mace do not support depthwise deconvolution yet")
 
-        op.type = MaceOp.Deconv2D.name
+        if param.HasField(caffe_group_str) and param.group > 1:
+            group_arg = op.arg.add()
+            group_arg.name = MaceKeyword.mace_group_str
+            group_arg.i = param.group
+            op.type = MaceOp.DepthwiseDeconv2d.name
+        else:
+            op.type = MaceOp.Deconv2D.name
 
         self.add_stride_pad_kernel_arg(param, op)
         # dilation is specific for convolution in caffe
diff --git a/mace/python/tools/converter_tool/shape_inference.py b/mace/python/tools/converter_tool/shape_inference.py
index e62affaf..fbc22783 100644
--- a/mace/python/tools/converter_tool/shape_inference.py
+++ b/mace/python/tools/converter_tool/shape_inference.py
@@ -36,6 +36,7 @@ class ShapeInference(object):
             MaceOp.Conv2D.name: self.infer_shape_conv_pool_shape,
             MaceOp.Deconv2D.name: self.infer_shape_deconv,
             MaceOp.DepthwiseConv2d.name: self.infer_shape_conv_pool_shape,
+            MaceOp.DepthwiseDeconv2d.name: self.infer_shape_deconv,
             MaceOp.Eltwise.name: self.infer_shape_general,
             MaceOp.BatchNorm.name: self.infer_shape_general,
             MaceOp.AddN.name: self.infer_shape_general,
@@ -159,11 +160,15 @@ class ShapeInference(object):
             dilations = [1, 1]
         round_func = math.floor
 
+        group_arg = ConverterUtil.get_arg(op,
+                                          MaceKeyword.mace_group_str)
         output_shape[0] = input_shape[0]
         if ConverterUtil.data_format(op) == DataFormat.NCHW \
                 and ConverterUtil.filter_format(self._net) == FilterFormat.OIHW:  # noqa
             # filter format: IOHW
             output_shape[1] = filter_shape[1]
+            if group_arg is not None and group_arg.i > 1:
+                output_shape[1] = group_arg.i * filter_shape[1]
             output_shape[2] = int(
                 round_func((input_shape[2] - 1) * strides[0] +
                            (filter_shape[2] - 1) * (dilations[0] - 1) +
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 1ab81452..107aab02 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -583,7 +583,7 @@ class Transformer(base_converter.ConverterInterface):
     def fold_deconv_and_bn(self):
         net = self._model
         for op in net.op:
-            if (op.type == MaceOp.Deconv2D.name) \
+            if (op.type in [MaceOp.Deconv2D.name, MaceOp.DepthwiseDeconv2d]) \
                     and self.consumer_count(op.output[0]) == 1:
                 consumer_op = self._consumers[op.output[0]][0]
                 if consumer_op.type == MaceOp.BatchNorm.name:
@@ -1365,7 +1365,8 @@ class Transformer(base_converter.ConverterInterface):
                 self.set_filter_format(FilterFormat.OIHW)
             # deconv's filter's output channel and input channel is reversed
             for op in net.op:
-                if op.type == MaceOp.Deconv2D.name \
+                if op.type in [MaceOp.Deconv2D.name,
+                               MaceOp.DepthwiseDeconv2d] \
                         and op.input[1] not in transposed_deconv_filter:
                     filter = self._consts[op.input[1]]
                     filter_data = np.array(filter.float_data).reshape(
@@ -1427,11 +1428,17 @@ class Transformer(base_converter.ConverterInterface):
                 self.buffer_transform(op, 1, OpenCLBufferType.CONV2D_FILTER)
                 if len(op.input) >= 3:
                     self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT)
-            elif op.type == MaceOp.Deconv2D.name:
-                self.buffer_transform(op, 1, OpenCLBufferType.CONV2D_FILTER)
+            elif op.type == MaceOp.Deconv2D.name\
+                    or op.type == MaceOp.DepthwiseDeconv2d.name:
+                if op.type == MaceOp.Deconv2D.name:
+                    self.buffer_transform(op, 1,
+                                          OpenCLBufferType.CONV2D_FILTER)
+                elif op.type == MaceOp.DepthwiseDeconv2d.name:
+                    self.buffer_transform(op, 1,
+                                          OpenCLBufferType.DW_CONV2D_FILTER)
                 if ConverterUtil.get_arg(
                         op,
-                        MaceKeyword.mace_framework_type_str).i ==\
+                        MaceKeyword.mace_framework_type_str).i == \
                         FrameworkType.CAFFE.value:
                     if len(op.input) >= 3:
                         self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT)
@@ -1456,8 +1463,10 @@ class Transformer(base_converter.ConverterInterface):
                 if len(op.input) >= 4:
                     self.buffer_transform(op, 3, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.MatMul.name and \
-                    ConverterUtil.get_arg(op,
-                                          MaceKeyword.mace_winograd_filter_transformed) is not None:  # noqa
+                    ConverterUtil.get_arg(
+                        op,
+                        MaceKeyword.mace_winograd_filter_transformed
+                    ) is not None:  # noqa
                 self.buffer_transform(op, 0, OpenCLBufferType.WINOGRAD_FILTER)
             elif op.type == MaceOp.WinogradInverseTransform.name \
                     and len(op.input) >= 3:
@@ -1467,8 +1476,10 @@ class Transformer(base_converter.ConverterInterface):
                 if len(op.input) >= 3:
                     self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.Activation.name:
-                if ConverterUtil.get_arg(op,
-                                         MaceKeyword.mace_activation_type_str).s == ActivationType.PRELU.name:  # noqa
+                if ConverterUtil.get_arg(
+                        op,
+                        MaceKeyword.mace_activation_type_str
+                ).s == ActivationType.PRELU.name:  # noqa
                     self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.LSTMCell.name:
                 if op.input[1] in self._consts:
@@ -1793,24 +1804,24 @@ class Transformer(base_converter.ConverterInterface):
             check_conv = False
             check_deconv = False
             if ops is not None and len(ops) == 1:
-                check_conv =\
-                    ops[0].type in [MaceOp.Conv2D.name,
-                                    MaceOp.DepthwiseConv2d.name,
-                                    MaceOp.FullyConnected.name]\
-                    and len(ops[0].input) >= 3\
-                    and ops[0].input[2] == tensor.name
+                if len(ops[0].input) >= 3:
+                    check_conv =\
+                        ops[0].type in [MaceOp.Conv2D.name,
+                                        MaceOp.DepthwiseConv2d.name,
+                                        MaceOp.FullyConnected.name]\
+                        and ops[0].input[2] == tensor.name
                 # in tensorflow deconv's bias is the forth input
-                if ops[0].type == MaceOp.Deconv2D.name:
+                if ops[0].type in [MaceOp.Deconv2D.name,
+                                   MaceOp.DepthwiseDeconv2d]:
                     from_caffe = ConverterUtil.get_arg(
                         ops[0],
                         MaceKeyword.mace_framework_type_str).i ==\
                                  FrameworkType.CAFFE.value
-                    if from_caffe:
-                        check_deconv = len(ops[0].input) >= 3\
-                                       and ops[0].input[2] == tensor.name
+                    if from_caffe and len(ops[0].input) >= 3:
+                        check_deconv = ops[0].input[2] == tensor.name
                     else:
-                        check_deconv = len(ops[0].input) >= 4\
-                                       and ops[0].input[3] == tensor.name
+                        if len(ops[0].input) >= 4:
+                            check_deconv = ops[0].input[3] == tensor.name
             if check_conv or check_deconv:
                 if self._option.device == DeviceType.CPU.value:
                     conv_op = ops[0]
diff --git a/repository/opencl-kernel/opencl_kernel_configure.bzl b/repository/opencl-kernel/opencl_kernel_configure.bzl
index c844a5d3..65cc5635 100644
--- a/repository/opencl-kernel/opencl_kernel_configure.bzl
+++ b/repository/opencl-kernel/opencl_kernel_configure.bzl
@@ -37,6 +37,7 @@ def _opencl_encrypt_kernel_impl(repository_ctx):
     unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/conv_2d_buffer.cl"))
     unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/crop.cl"))
     unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/deconv_2d.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depthwise_deconv2d.cl"))
     unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depth_to_space.cl"))
     unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depthwise_conv2d.cl"))
     unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depthwise_conv2d_buffer.cl"))
diff --git a/tools/validate.py b/tools/validate.py
index be499c1a..3c85fb55 100644
--- a/tools/validate.py
+++ b/tools/validate.py
@@ -18,8 +18,6 @@ import os
 import os.path
 import numpy as np
 import re
-from scipy import spatial
-from scipy import stats
 
 import common
 
@@ -60,14 +58,22 @@ def calculate_sqnr(expected, actual):
     return signal_power_sum / (noise_power_sum + 1e-15)
 
 
+def calculate_similarity(u, v, data_type=np.float64):
+    if u.dtype is not data_type:
+        u = u.astype(data_type)
+    if v.dtype is not data_type:
+        v = v.astype(data_type)
+    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
+
+
 def compare_output(platform, device_type, output_name, mace_out_value,
                    out_value, validation_threshold):
     if mace_out_value.size != 0:
         out_value = out_value.reshape(-1)
         mace_out_value = mace_out_value.reshape(-1)
         assert len(out_value) == len(mace_out_value)
-        similarity = (1 - spatial.distance.cosine(out_value, mace_out_value))
         sqnr = calculate_sqnr(out_value, mace_out_value)
+        similarity = calculate_similarity(out_value, mace_out_value)
         common.MaceLogger.summary(
             output_name + ' MACE VS ' + platform.upper()
             + ' similarity: ' + str(similarity) + ' , sqnr: ' + str(sqnr))
-- 
GitLab