add 2x2s2_deconv for neon and optimize deconv and dw_deconv ops' structure

48e6ee96 · liutuo · fc7f4967 · 48e6ee96 · 48e6ee96 · 48e6ee96
7 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,9 @@ mace/examples/android/macelibrary/src/main/cpp/mace/
 *swp
 *~
 .python-version
+
+mace/examples/android/macelibrary/src/main/cpp/include/mace/public/
+
+mace/examples/android/macelibrary/src/main/cpp/lib/arm64-v8a/
+
+mace/examples/android/macelibrary/src/main/jniLibs/arm64-v8a/
--- a/mace/ops/arm/deconv_2d_neon.h
+++ b/mace/ops/arm/deconv_2d_neon.h
@@ -21,6 +21,18 @@
 namespace mace {
 namespace ops {

+void Deconv2dNeonK2x2S1(const float *input,
+                        const float *filter,
+                        const index_t *in_shape,
+                        const index_t *out_shape,
+                        float *output);
+
+void Deconv2dNeonK2x2S2(const float *input,
+                        const float *filter,
+                        const index_t *in_shape,
+                        const index_t *out_shape,
+                        float *output);
+
 void Deconv2dNeonK3x3S1(const float *input,
                        const float *filter,
                        const index_t *in_shape,

--- a/mace/ops/arm/deconv_2d_neon_2x2.cc
+++ b/mace/ops/arm/deconv_2d_neon_2x2.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/macros.h"
+#include "mace/ops/arm/deconv_2d_neon.h"
+
+namespace mace {
+namespace ops {
+
+void Deconv2dNeonK2x2S1(const float *input,
+                        const float *filter,
+                        const index_t *in_shape,
+                        const index_t *out_shape,
+                        float *output) {
+  const index_t inch = in_shape[1];
+  const index_t h = in_shape[2];
+  const index_t w = in_shape[3];
+
+  const index_t outch = out_shape[1];
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+
+  const index_t out_img_size = outh * outw;
+
+#pragma omp parallel for collapse(2)
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t oc = 0; oc < outch; oc += 2) {
+      if (oc + 1 < outch) {
+        float *out_base0 = output + (b * outch + oc) * out_img_size;
+        float *out_base1 = out_base0 + out_img_size;
+        for (index_t ic = 0; ic < inch; ++ic) {
+          const float *input_base = input + (b * inch + ic) * h * w;
+          const float *kernel_base0 = filter + (oc * inch + ic) * 4;
+          const float *kernel_base1 = kernel_base0 + inch * 4;
+          const float *in = input_base;
+          // output channel 0
+          const float *k0 = kernel_base0;
+          // output channel 1
+          const float *k1 = kernel_base1;
+#if defined(MACE_ENABLE_NEON)
+          // load filter
+          float32x4_t k0_vec = vld1q_f32(k0);
+          float32x4_t k1_vec = vld1q_f32(k1);
+#endif
+          for (index_t i = 0; i < h; ++i) {
+            float *out_row_base0 = out_base0 + i * outw;
+            float *out_row0_0 = out_row_base0;
+            float *out_row0_1 = out_row_base0 + outw;
+
+            float *out_row_base1 = out_base1 + i * outw;
+            float *out_row1_0 = out_row_base1;
+            float *out_row1_1 = out_row_base1 + outw;
+
+            index_t j = 0;
+#if defined(MACE_ENABLE_NEON)
+            for (; j + 3 < w; j += 4) {
+              float32x4_t in_vec = vld1q_f32(in);
+
+              float32x4_t out00, out01, out02, out03;
+              float32x4_t out10, out11, out12, out13;
+
+              out00 = vld1q_f32(out_row0_0);
+              out00 = neon_vfma_lane_0(out00, in_vec, k0_vec);
+              vst1q_f32(out_row0_0, out00);
+
+              out01 = vld1q_f32(out_row0_0 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k0_vec);
+              vst1q_f32(out_row0_0 + 1, out01);
+
+              out02 = vld1q_f32(out_row0_1);
+              out02 = neon_vfma_lane_2(out02, in_vec, k0_vec);
+              vst1q_f32(out_row0_1, out02);
+
+              out03 = vld1q_f32(out_row0_1 + 1);
+              out03 = neon_vfma_lane_3(out03, in_vec, k0_vec);
+              vst1q_f32(out_row0_1 + 1, out03);
+
+              out10 = vld1q_f32(out_row1_0);
+              out10 = neon_vfma_lane_0(out10, in_vec, k1_vec);
+              vst1q_f32(out_row1_0, out10);
+
+              out11 = vld1q_f32(out_row1_0 + 1);
+              out11 = neon_vfma_lane_1(out11, in_vec, k1_vec);
+              vst1q_f32(out_row1_0 + 1, out11);
+
+              out12 = vld1q_f32(out_row1_1);
+              out12 = neon_vfma_lane_2(out12, in_vec, k1_vec);
+              vst1q_f32(out_row1_1, out12);
+
+              out13 = vld1q_f32(out_row1_1 + 1);
+              out13 = neon_vfma_lane_3(out13, in_vec, k1_vec);
+              vst1q_f32(out_row1_1 + 1, out13);
+
+              in += 4;
+              out_row0_0 += 4;
+              out_row0_1 += 4;
+              out_row1_0 += 4;
+              out_row1_1 += 4;
+            }
+#endif
+            for (; j < w; ++j) {
+              float val = in[0];
+              for (int k = 0; k < 2; ++k) {
+                out_row0_0[k] += val * k0[k];
+                out_row0_1[k] += val * k0[k + 2];
+                out_row1_0[k] += val * k1[k];
+                out_row1_1[k] += val * k1[k + 2];
+              }
+              in++;
+              out_row0_0++;
+              out_row0_1++;
+              out_row1_0++;
+              out_row1_1++;
+            }
+          }
+        }
+      } else {
+        float *out_base0 = output + (b * outch + oc) * outh * outw;
+        for (index_t ic = 0; ic < inch; ++ic) {
+          const float *input_base = input + (b * inch + ic) * h * w;
+          const float *kernel_base0 = filter + (oc * inch + ic) * 4;
+          const float *in = input_base;
+          const float *k0 = kernel_base0;
+
+#if defined(MACE_ENABLE_NEON)
+          // load filter
+          float32x4_t k0_vec = vld1q_f32(k0);
+#endif
+          for (index_t i = 0; i < h; ++i) {
+            float *out_row_base0 = out_base0 + i * outw;
+            float *out_row0_0 = out_row_base0;
+            float *out_row0_1 = out_row_base0 + outw;
+            index_t j = 0;
+#if defined(MACE_ENABLE_NEON)
+            for (; j + 3 < w; j += 4) {
+              float32x4_t in_vec = vld1q_f32(in);
+              float32x4_t out00, out01, out02, out03;
+
+              out00 = vld1q_f32(out_row0_0);
+              out00 = neon_vfma_lane_0(out00, in_vec, k0_vec);
+              vst1q_f32(out_row0_0, out00);
+
+              out01 = vld1q_f32(out_row0_0 + 1);
+              out01 = neon_vfma_lane_1(out01, in_vec, k0_vec);
+              vst1q_f32(out_row0_0 + 1, out01);
+
+              out02 = vld1q_f32(out_row0_1);
+              out02 = neon_vfma_lane_2(out02, in_vec, k0_vec);
+              vst1q_f32(out_row0_1, out02);
+
+              out03 = vld1q_f32(out_row0_1 + 1);
+              out03 = neon_vfma_lane_3(out03, in_vec, k0_vec);
+              vst1q_f32(out_row0_1 + 1, out03);
+
+              in += 4;
+              out_row0_0 += 4;
+              out_row0_1 += 4;
+            }
+#endif
+            for (; j < w; ++j) {
+              float val = in[0];
+              for (int k = 0; k < 2; ++k) {
+                out_row0_0[k] += val * k0[k];
+                out_row0_1[k] += val * k0[k + 2];
+              }
+              in++;
+              out_row0_0++;
+              out_row0_1++;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void Deconv2dNeonK2x2S2(const float *input,
+                        const float *filter,
+                        const index_t *in_shape,
+                        const index_t *out_shape,
+                        float *output) {
+  const index_t inch = in_shape[1];
+  const index_t h = in_shape[2];
+  const index_t w = in_shape[3];
+
+  const index_t outch = out_shape[1];
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+  const index_t out_img_size = outh * outw;
+
+#pragma omp parallel for collapse(2)
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t oc = 0; oc < outch; ++oc) {
+      float *out_base = output + (b * outch + oc) * out_img_size;
+      for (index_t ic = 0; ic < inch; ++ic) {
+        const float *input_base = input + (b * inch + ic) * h * w;
+        const float *kernel_base = filter + (oc * inch + ic) * 4;
+        const float *in = input_base;
+        const float *k0 = kernel_base;
+#if defined(MACE_ENABLE_NEON)
+        float32x4_t k0_vec = vld1q_f32(k0);
+#endif
+        for (index_t i = 0; i < h; ++i) {
+          float *out_row_base = out_base + i * 2 * outw;
+          float *out_row_0 = out_row_base;
+          float *out_row_1 = out_row_0 + outw;
+
+          index_t j = 0;
+#if defined(MACE_ENABLE_NEON)
+          for (; j + 3 < w; j += 4) {
+            float32x4_t in_vec = vld1q_f32(in);
+
+            // out row 0
+            float32x4x2_t out00 = vld2q_f32(out_row_0);
+            out00.val[0] =
+              neon_vfma_lane_0(out00.val[0], in_vec, k0_vec);
+            out00.val[1] =
+              neon_vfma_lane_1(out00.val[1], in_vec, k0_vec);
+            vst2q_f32(out_row_0, out00);
+
+            // out row 1
+            float32x4x2_t out10 = vld2q_f32(out_row_1);
+            out10.val[0] =
+              neon_vfma_lane_2(out10.val[0], in_vec, k0_vec);
+            out10.val[1] =
+              neon_vfma_lane_3(out10.val[1], in_vec, k0_vec);
+            vst2q_f32(out_row_1, out10);
+
+            in += 4;
+            out_row_0 += 8;
+            out_row_1 += 8;
+          }
+#endif
+          for (; j < w; ++j) {
+            float val = in[0];
+            for (int k = 0; k < 2; ++k) {
+              out_row_0[k] += val * k0[k];
+              out_row_1[k] += val * k0[k + 2];
+            }
+            in++;
+            out_row_0 += 2;
+            out_row_1 += 2;
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/deconv_2d.cc
+++ b/mace/ops/deconv_2d.cc
@@ -25,11 +25,9 @@
 #include <vector>

 #include "mace/core/future.h"
-#include "mace/core/operator.h"
 #include "mace/core/tensor.h"
 #include "mace/ops/activation.h"
 #include "mace/ops/arm/deconv_2d_neon.h"
-#include "mace/ops/conv_pool_2d_util.h"
 #include "mace/utils/utils.h"
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/ops/opencl/image/deconv_2d.h"
@@ -38,134 +36,6 @@
 namespace mace {
 namespace ops {

-class Deconv2dOpBase : public Operation {
- public:
-  explicit Deconv2dOpBase(OpConstructContext *context)
-      : Operation(context),
-        strides_(Operation::GetRepeatedArgs<int>("strides")),
-        padding_type_(static_cast<Padding>(Operation::GetOptionalArg<int>(
-            "padding", static_cast<int>(SAME)))),
-        paddings_(Operation::GetRepeatedArgs<int>("padding_values")),
-        model_type_(static_cast<ops::FrameworkType>(
-                        Operation::GetOptionalArg<int>("framework_type", 0))),
-        activation_(ops::StringToActivationType(
-            Operation::GetOptionalArg<std::string>("activation",
-                                                  "NOOP"))),
-        relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
-
-
-  static void CalcDeconvOutputSize(
-      const index_t *input_shape,   // NHWC
-      const index_t *filter_shape,  // OIHW
-      const int *strides,
-      index_t *output_shape,
-      const int *padding_size,
-      int *input_padding,
-      const bool isNCHW = false) {
-    MACE_CHECK_NOTNULL(output_shape);
-    MACE_CHECK_NOTNULL(padding_size);
-    MACE_CHECK_NOTNULL(input_shape);
-    MACE_CHECK_NOTNULL(filter_shape);
-    MACE_CHECK_NOTNULL(strides);
-
-    const index_t output_channel = filter_shape[0];
-
-    const index_t in_height = isNCHW ? input_shape[2] : input_shape[1];
-    const index_t in_width = isNCHW ? input_shape[3] : input_shape[2];
-
-    const index_t kernel_h = filter_shape[2];
-    const index_t kernel_w = filter_shape[3];
-
-    input_padding[0] = static_cast<int>((kernel_h -1) * 2 - padding_size[0]);
-    input_padding[1] = static_cast<int>((kernel_w -1) * 2 - padding_size[1]);
-    input_padding[0] = std::max<int>(0, input_padding[0]);
-    input_padding[1] = std::max<int>(0, input_padding[1]);
-
-    index_t out_height =
-        (in_height - 1) * strides[0] + kernel_h - padding_size[0];
-    index_t out_width =
-        (in_width - 1) * strides[1] + kernel_w - padding_size[1];
-
-    output_shape[0] = input_shape[0];
-    if (isNCHW) {
-      output_shape[1] = output_channel;
-      output_shape[2] = out_height;
-      output_shape[3] = out_width;
-    } else {
-      output_shape[1] = out_height;
-      output_shape[2] = out_width;
-      output_shape[3] = output_channel;
-    }
-  }
-
-  static void CalcDeconvPaddingAndInputSize(
-      const index_t *input_shape,   // NHWC
-      const index_t *filter_shape,  // OIHW
-      const int *strides,
-      Padding padding,
-      const index_t *output_shape,
-      int *padding_size,
-      const bool isNCHW = false) {
-    MACE_CHECK_NOTNULL(output_shape);
-    MACE_CHECK_NOTNULL(padding_size);
-    MACE_CHECK_NOTNULL(input_shape);
-    MACE_CHECK_NOTNULL(filter_shape);
-    MACE_CHECK_NOTNULL(strides);
-
-    const index_t in_height = isNCHW ? input_shape[2] : input_shape[1];
-    const index_t in_width = isNCHW ? input_shape[3] : input_shape[2];
-
-    const index_t out_height = isNCHW ? output_shape[2] : output_shape[1];
-    const index_t out_width = isNCHW ? output_shape[3] : output_shape[2];
-
-    const index_t extended_input_height = (in_height - 1) * strides[0] + 1;
-    const index_t extended_input_width = (in_width - 1) * strides[1] + 1;
-
-    const index_t filter_h = filter_shape[2];
-    const index_t filter_w = filter_shape[3];
-
-    index_t expected_input_height = 0, expected_input_width = 0;
-
-    switch (padding) {
-      case VALID:
-        expected_input_height =
-            (out_height - filter_h + strides[0]) / strides[0];
-        expected_input_width =
-            (out_width - filter_w + strides[1]) / strides[1];
-        break;
-      case SAME:
-        expected_input_height =
-            (out_height + strides[0] - 1) / strides[0];
-        expected_input_width =
-            (out_width + strides[1] - 1) / strides[1];
-        break;
-      default:
-        MACE_CHECK(false, "Unsupported padding type: ", padding);
-    }
-
-    MACE_CHECK(expected_input_height == in_height,
-               expected_input_height, "!=", in_height);
-    MACE_CHECK(expected_input_width == in_width,
-               expected_input_width, "!=", in_width);
-
-    const int p_h = static_cast<int>(out_height +
-        filter_h - 1 - extended_input_height);
-    const int p_w = static_cast<int>(out_width +
-        filter_w - 1 - extended_input_width);
-
-    padding_size[0] = std::max<int>(0, p_h);
-    padding_size[1] = std::max<int>(0, p_w);
-  }
-
- protected:
-  std::vector<int> strides_;  // [stride_h, stride_w]
-  const Padding padding_type_;
-  std::vector<int> paddings_;
-  const FrameworkType model_type_;
-  const ActivationType activation_;
-  const float relux_max_limit_;
-};
-
 template <DeviceType D, class T>
 class Deconv2dOp;

@@ -193,56 +63,65 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
    MACE_CHECK_NOTNULL(filter);
    MACE_CHECK_NOTNULL(output);

-    std::vector<int> paddings(2);
-    std::vector<int> out_paddings(2);
-    std::vector<index_t> output_shape(4);
+    std::vector<int> in_paddings(2, 0);
+    std::vector<int> out_paddings(2, 0);
+    std::vector<index_t> out_shape(4, 0);
+    std::vector<index_t> padded_out_shape(4, 0);
+
    if (model_type_ == FrameworkType::TENSORFLOW) {  // tensorflow
-      paddings = std::vector<int>(2, 0);
      MACE_CHECK_NOTNULL(output_shape_tensor);
      MACE_CHECK(output_shape_tensor->size() == 4);
      Tensor::MappingGuard output_shape_mapper(output_shape_tensor);
      auto output_shape_data =
          output_shape_tensor->data<int32_t>();
-      output_shape =
+      out_shape =
          std::vector<index_t>(output_shape_data, output_shape_data + 4);

-      const index_t t = output_shape[1];
-      output_shape[1] = output_shape[3];
-      output_shape[3] = output_shape[2];
-      output_shape[2] = t;
+      const index_t t = out_shape[1];
+      out_shape[1] = out_shape[3];
+      out_shape[3] = out_shape[2];
+      out_shape[2] = t;

-      CalcDeconvPaddingAndInputSize(
+      CalcDeconvShape_TF(
          input->shape().data(),
          filter->shape().data(),
-          strides_.data(), padding_type_,
-          output_shape.data(),
-          paddings.data(), true);
+          out_shape.data(),
+          strides_.data(),
+          1,
+          padding_type_,
+          in_paddings.data(),
+          out_paddings.data(),
+          padded_out_shape.data(),
+          true);
    } else {  // caffe
-      out_paddings = paddings_;
-      output_shape = std::vector<index_t>(4, 0);
-      CalcDeconvOutputSize(input->shape().data(),
-                           filter->shape().data(),
-                           strides_.data(),
-                           output_shape.data(),
-                           out_paddings.data(),
-                           paddings.data(),
-                           true);
+      if (!paddings_.empty()) out_paddings = paddings_;
+      CalcDeconvShape_Caffe(
+          input->shape().data(),
+          filter->shape().data(),
+          strides_.data(),
+          out_paddings.data(),
+          1,
+          in_paddings.data(),
+          out_shape.data(),
+          padded_out_shape.data(),
+          true);
    }
-    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+    MACE_RETURN_IF_ERROR(output->Resize(out_shape));
+    output->Clear();
    index_t kernel_h = filter->dim(2);
    index_t kernel_w = filter->dim(3);
    const index_t *in_shape = input->shape().data();

-    MACE_CHECK(filter->dim(0) == output_shape[1], filter->dim(0), " != ",
-               output_shape[1]);
+    MACE_CHECK(filter->dim(0) == out_shape[1], filter->dim(0), " != ",
+               out_shape[1]);
    MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ",
               in_shape[1]);
-    MACE_CHECK(in_shape[0] == output_shape[0],
+    MACE_CHECK(in_shape[0] == out_shape[0],
               "Input/Output batch size mismatch");
    std::function<void(const float *input,
                       const float *filter,
                       const index_t *in_shape,
-                       const index_t *out_shape,
+                       const index_t *output_shape,
                       float *output)> deconv_func;

    Tensor::MappingGuard input_mapper(input);
@@ -254,13 +133,9 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
    auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
    auto output_data = output->mutable_data<float>();

-    const index_t padded_out_h = (in_shape[2] - 1) * strides_[0] + kernel_h;
-    const index_t padded_out_w = (in_shape[3] - 1) * strides_[1] + kernel_w;
-    const index_t pad_h = (padded_out_h - output_shape[2]) / 2;
-    const index_t pad_w = (padded_out_w - output_shape[3]) / 2;
+    const index_t pad_h = out_paddings[0] / 2;
+    const index_t pad_w = out_paddings[1] / 2;

-    std::vector<index_t> padded_out_shape({output_shape[0], output_shape[1],
-                                           padded_out_h, padded_out_w});
    index_t padded_out_size =
        std::accumulate(padded_out_shape.begin(),
                        padded_out_shape.end(),
@@ -274,6 +149,11 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
    padded_out.Clear();
    auto *padded_out_data = padded_out.mutable_data<float>();

+    bool use_neon_2x2_s1 = kernel_h == kernel_w && kernel_h == 2 &&
+        strides_[0] == strides_[1] && strides_[0] == 1;
+    bool use_neon_2x2_s2 = kernel_h == kernel_w && kernel_h == 2 &&
+        strides_[0] == strides_[1] && strides_[0] == 2;
+
    bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 &&
        strides_[0] == strides_[1] && strides_[0] == 1;
    bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 &&
@@ -284,73 +164,98 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
    bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 &&
        strides_[0] == strides_[1] && strides_[0] == 2;

-    if (use_neon_3x3_s1) {
+    if (use_neon_2x2_s1) {
      deconv_func = [=](const float *input,
                        const float *filter,
-                        const index_t *in_shape,
-                        const index_t *padded_out_shape,
+                        const index_t *input_shape,
+                        const index_t *padded_output_shape,
+                        float *padded_output) {
+        Deconv2dNeonK2x2S1(input,
+                           filter,
+                           input_shape,
+                           padded_output_shape,
+                           padded_output);
+      };
+    } else if (use_neon_2x2_s2) {
+      deconv_func = [=](const float *input,
+                        const float *filter,
+                        const index_t *input_shape,
+                        const index_t *padded_output_shape,
+                        float *padded_output) {
+        Deconv2dNeonK2x2S2(input,
+                           filter,
+                           input_shape,
+                           padded_output_shape,
+                           padded_output);
+      };
+    } else if (use_neon_3x3_s1) {
+      deconv_func = [=](const float *input,
+                        const float *filter,
+                        const index_t *input_shape,
+                        const index_t *padded_output_shape,
                        float *padded_output) {
        Deconv2dNeonK3x3S1(input,
                           filter,
-                           in_shape,
-                           padded_out_shape,
+                           input_shape,
+                           padded_output_shape,
                           padded_output);
      };
    } else if (use_neon_3x3_s2) {
      deconv_func = [=](const float *input,
                        const float *filter,
-                        const index_t *in_shape,
-                        const index_t *padded_out_shape,
+                        const index_t *input_shape,
+                        const index_t *padded_output_shape,
                        float *padded_output) {
        Deconv2dNeonK3x3S2(input,
                           filter,
-                           in_shape,
-                           padded_out_shape,
+                           input_shape,
+                           padded_output_shape,
                           padded_output);
      };
    } else if (use_neon_4x4_s1) {
      deconv_func = [=](const float *input,
                        const float *filter,
-                        const index_t *in_shape,
-                        const index_t *padded_out_shape,
+                        const index_t *input_shape,
+                        const index_t *padded_output_shape,
                        float *padded_output) {
        Deconv2dNeonK4x4S1(input,
                           filter,
-                           in_shape,
-                           padded_out_shape,
+                           input_shape,
+                           padded_output_shape,
                           padded_output);
      };
    } else if (use_neon_4x4_s2) {
      deconv_func = [=](const float *input,
                        const float *filter,
-                        const index_t *in_shape,
-                        const index_t *padded_out_shape,
+                        const index_t *input_shape,
+                        const index_t *padded_output_shape,
                        float *padded_output) {
        Deconv2dNeonK4x4S2(input,
                           filter,
-                           in_shape,
-                           padded_out_shape,
+                           input_shape,
+                           padded_output_shape,
                           padded_output);
      };
    } else {
      deconv_func = [=](const float *input,
                        const float *filter,
-                        const index_t *in_shape,
-                        const index_t *padded_out_shape,
+                        const index_t *input_shape,
+                        const index_t *padded_output_shape,
                        float *padded_output) {
        Deconv2dGeneral(input,
                        filter,
                        kernel_h,
                        kernel_w,
                        strides_.data(),
-                        in_shape,
-                        padded_out_shape,
+                        input_shape,
+                        padded_output_shape,
                        padded_output);
      };
    }

    bool no_pad =
-        padded_out_h == output_shape[2] && padded_out_w == output_shape[3];
+        (padded_out_shape[2] == out_shape[2]) &&
+            (padded_out_shape[3] == out_shape[3]);
    float *out_data = no_pad ? output_data : padded_out_data;

    deconv_func(input_data,
@@ -361,16 +266,16 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
    if (!no_pad) {
      CropPadOut<float>(out_data,
                        padded_out_shape.data(),
-                        output_shape.data(),
+                        out_shape.data(),
                        pad_h,
                        pad_w,
                        output_data);
    }

    if (bias_data != nullptr) {
-      const index_t batch = output_shape[0];
-      const index_t channels = output_shape[1];
-      const index_t img_size = output_shape[2] * output_shape[3];
+      const index_t batch = out_shape[0];
+      const index_t channels = out_shape[1];
+      const index_t img_size = out_shape[2] * out_shape[3];
 #pragma omp parallel for collapse(3)
      for (index_t b = 0; b < batch; ++b) {
        for (index_t c = 0; c < channels; ++c) {
@@ -476,39 +381,46 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
    MACE_CHECK_NOTNULL(input);
    MACE_CHECK_NOTNULL(filter);
    MACE_CHECK_NOTNULL(output);
-    std::vector<int> paddings(2);
-    std::vector<int> out_paddings(2);
-    std::vector<index_t> output_shape(4);
+
+    std::vector<int> in_paddings(2, 0);
+    std::vector<index_t> out_shape(4, 0);
+
    if (model_type_ == FrameworkType::TENSORFLOW) {
-      paddings = std::vector<int>(2, 0);
      MACE_CHECK_NOTNULL(output_shape_tensor);
      MACE_CHECK(output_shape_tensor->size() == 4);
      Tensor::MappingGuard output_shape_mapper(output_shape_tensor);
      auto output_shape_data =
          output_shape_tensor->data<int32_t>();
-      output_shape =
+      out_shape =
          std::vector<index_t>(output_shape_data, output_shape_data + 4);
-      CalcDeconvPaddingAndInputSize(input->shape().data(),
-                                    filter->shape().data(),
-                                    strides_.data(),
-                                    padding_type_,
-                                    output_shape.data(),
-                                    paddings.data());
+
+      CalcDeconvShape_TF(
+          input->shape().data(),
+          filter->shape().data(),
+          out_shape.data(),
+          strides_.data(),
+          1,
+          padding_type_,
+          in_paddings.data(),
+          nullptr,
+          nullptr);
    } else {
-      out_paddings = paddings_;
-      paddings = std::vector<int>(2, 0);
-      output_shape = std::vector<index_t>(4, 0);
-      CalcDeconvOutputSize(input->shape().data(),
-                           filter->shape().data(),
-                           strides_.data(),
-                           output_shape.data(),
-                           out_paddings.data(),
-                           paddings.data());
+      std::vector<int> out_paddings(2, 0);
+      if (!paddings_.empty()) out_paddings = paddings_;
+      CalcDeconvShape_Caffe(
+          input->shape().data(),
+          filter->shape().data(),
+          strides_.data(),
+          out_paddings.data(),
+          1,
+          in_paddings.data(),
+          out_shape.data(),
+          nullptr);
    }

    return kernel_->Compute(context, input, filter, bias,
-                            strides_.data(), paddings.data(), activation_,
-                            relux_max_limit_, output_shape, output);
+                            strides_.data(), in_paddings.data(), activation_,
+                            relux_max_limit_, out_shape, output);
  }

 private:

--- a/mace/ops/deconv_2d.h
+++ b/mace/ops/deconv_2d.h
@@ -15,7 +15,14 @@
 #ifndef MACE_OPS_DECONV_2D_H_
 #define MACE_OPS_DECONV_2D_H_

+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "mace/core/operator.h"
 #include "mace/core/types.h"
+#include "mace/ops/activation.h"
+#include "mace/ops/conv_pool_2d_util.h"

 namespace mace {
 namespace ops {
@@ -25,6 +32,167 @@ enum FrameworkType {
  CAFFE = 1,
 };

+class Deconv2dOpBase : public Operation {
+ public:
+  explicit Deconv2dOpBase(OpConstructContext *context)
+      : Operation(context),
+        strides_(Operation::GetRepeatedArgs<int>("strides")),
+        padding_type_(static_cast<Padding>(Operation::GetOptionalArg<int>(
+            "padding", static_cast<int>(SAME)))),
+        paddings_(Operation::GetRepeatedArgs<int>("padding_values")),
+        group_(Operation::GetOptionalArg<int>("group", 1)),
+        model_type_(static_cast<ops::FrameworkType>(
+                        Operation::GetOptionalArg<int>("framework_type", 0))),
+        activation_(ops::StringToActivationType(
+            Operation::GetOptionalArg<std::string>("activation",
+                                                   "NOOP"))),
+        relux_max_limit_(
+            Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
+
+  static void CalcDeconvShape_Caffe(
+      const index_t *input_shape,   // NHWC
+      const index_t *filter_shape,  // OIHW
+      const int *strides,
+      const int *out_paddings,
+      const int group,
+      int *in_paddings,
+      index_t *out_shape,
+      index_t *padded_out_shape,
+      const bool isNCHW = false) {
+    MACE_CHECK_NOTNULL(out_paddings);
+    MACE_CHECK_NOTNULL(input_shape);
+    MACE_CHECK_NOTNULL(filter_shape);
+    MACE_CHECK_NOTNULL(strides);
+
+    const index_t in_height = isNCHW ? input_shape[2] : input_shape[1];
+    const index_t in_width = isNCHW ? input_shape[3] : input_shape[2];
+
+    const index_t output_channel = filter_shape[0] * group;
+
+    const index_t kernel_h = filter_shape[2];
+    const index_t kernel_w = filter_shape[3];
+
+    index_t padded_out_height =
+        (in_height - 1) * strides[0] + kernel_h;
+    index_t padded_out_width =
+        (in_width - 1) * strides[1] + kernel_w;
+
+    if (in_paddings != nullptr) {
+      in_paddings[0] = static_cast<int>((kernel_h - 1) * 2 - out_paddings[0]);
+      in_paddings[1] = static_cast<int>((kernel_w - 1) * 2 - out_paddings[1]);
+      in_paddings[0] = std::max<int>(0, in_paddings[0]);
+      in_paddings[1] = std::max<int>(0, in_paddings[1]);
+    }
+
+    if (padded_out_shape != nullptr) {
+      padded_out_shape[0] = input_shape[0];
+      padded_out_shape[1] = isNCHW ? output_channel : padded_out_height;
+      padded_out_shape[2] = isNCHW ? padded_out_height : padded_out_width;
+      padded_out_shape[3] = isNCHW ? padded_out_width : output_channel;
+    }
+
+    if (out_shape != nullptr) {
+      index_t out_height = padded_out_height - out_paddings[0];
+      index_t out_width = padded_out_width - out_paddings[1];
+      out_shape[0] = input_shape[0];
+      out_shape[1] = isNCHW ? output_channel : out_height;
+      out_shape[2] = isNCHW ? out_height : out_width;
+      out_shape[3] = isNCHW ? out_width : output_channel;
+    }
+  }
+
+  static void CalcDeconvShape_TF(
+      const index_t *input_shape,   // NHWC
+      const index_t *filter_shape,  // OIHW
+      const index_t *output_shape,
+      const int *strides,
+      const int group,
+      Padding padding_type,
+      int *in_paddings,
+      int *out_paddings,
+      index_t *padded_out_shape,
+      const bool isNCHW = false) {
+    MACE_CHECK_NOTNULL(output_shape);
+    MACE_CHECK_NOTNULL(input_shape);
+    MACE_CHECK_NOTNULL(filter_shape);
+    MACE_CHECK_NOTNULL(strides);
+
+    const index_t in_height = isNCHW ? input_shape[2] : input_shape[1];
+    const index_t in_width = isNCHW ? input_shape[3] : input_shape[2];
+
+    const index_t out_height = isNCHW ? output_shape[2] : output_shape[1];
+    const index_t out_width = isNCHW ? output_shape[3] : output_shape[2];
+
+    const index_t extended_in_height = (in_height - 1) * strides[0] + 1;
+    const index_t extended_in_width = (in_width - 1) * strides[1] + 1;
+
+    const index_t kernel_h = filter_shape[2];
+    const index_t kernel_w = filter_shape[3];
+
+    index_t expected_input_height = 0, expected_input_width = 0;
+
+    switch (padding_type) {
+      case VALID:
+        expected_input_height =
+            (out_height - kernel_h + strides[0]) / strides[0];
+        expected_input_width =
+            (out_width - kernel_w + strides[1]) / strides[1];
+        break;
+      case SAME:
+        expected_input_height =
+            (out_height + strides[0] - 1) / strides[0];
+        expected_input_width =
+            (out_width + strides[1] - 1) / strides[1];
+        break;
+      default:
+        MACE_CHECK(false, "Unsupported padding type: ", padding_type);
+    }
+
+    MACE_CHECK(expected_input_height == in_height,
+               expected_input_height, "!=", in_height);
+    MACE_CHECK(expected_input_width == in_width,
+               expected_input_width, "!=", in_width);
+
+    const index_t padded_out_height =
+        (in_height - 1) * strides[0] + kernel_h;
+    const index_t padded_out_width =
+        (in_width - 1) * strides[1] + kernel_w;
+
+    if (in_paddings != nullptr) {
+      const int p_h =
+          static_cast<int>(out_height + kernel_h - 1 - extended_in_height);
+      const int p_w =
+          static_cast<int>(out_width + kernel_w - 1 - extended_in_width);
+      in_paddings[0] = std::max<int>(0, p_h);
+      in_paddings[1] = std::max<int>(0, p_w);
+    }
+
+    if (out_paddings != nullptr) {
+      const int o_p_h = static_cast<int>(padded_out_height - out_height);
+      const int o_p_w = static_cast<int>(padded_out_width - out_width);
+      out_paddings[0] = std::max<int>(0, o_p_h);
+      out_paddings[1] = std::max<int>(0, o_p_w);
+    }
+
+    if (padded_out_shape != nullptr) {
+      index_t output_channel = filter_shape[0] * group;
+      padded_out_shape[0] = output_shape[0];
+      padded_out_shape[1] = isNCHW ? output_channel : padded_out_height;
+      padded_out_shape[2] = isNCHW ? padded_out_height : padded_out_width;
+      padded_out_shape[3] = isNCHW ? padded_out_width : output_channel;
+    }
+  }
+
+ protected:
+  std::vector<int> strides_;  // [stride_h, stride_w]
+  const Padding padding_type_;
+  std::vector<int> paddings_;
+  const int group_;
+  const FrameworkType model_type_;
+  const ActivationType activation_;
+  const float relux_max_limit_;
+};
+
 template <typename T>
 void CropPadOut(const T *input,
                const index_t *in_shape,

--- a/mace/ops/deconv_2d_test.cc
+++ b/mace/ops/deconv_2d_test.cc
@@ -370,9 +370,9 @@ TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingValid_S2) {

 namespace {
 template <DeviceType D, typename T>
-void TestComplexDeconvNxNS12(const int batch,
-                             const std::vector<int> &shape,
-                             const int stride) {
+void TestComplexDeconvNxN(const int batch,
+                          const std::vector<int> &shape,
+                          const int stride) {
  testing::internal::LogToStderr();
  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                  Padding type, int padding) {
@@ -415,8 +415,6 @@ void TestComplexDeconvNxNS12(const int batch,
      output_shape.push_back(output_channels);
      net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape);
    } else {
-      //      out_h = (height - 1) * stride + 1 + padding - kernel_h + 1;
-      //      out_w = (width -1) * stride + 1 + padding - kernel_w + 1;
      paddings.push_back(padding);
      paddings.push_back(padding);
    }
@@ -497,38 +495,42 @@ void TestComplexDeconvNxNS12(const int batch,
                            1e-4);
  };

-  for (int kernel_size : {3, 4, 5, 7}) {
-    func(kernel_size, kernel_size, stride, stride, VALID, -1);
-    func(kernel_size, kernel_size, stride, stride, SAME, -1);
-    func(kernel_size, kernel_size, stride, stride, VALID, 2);
-    func(kernel_size, kernel_size, stride, stride, VALID, 3);
+  for (int kernel_size : {2, 3, 4, 5, 7}) {
+    if (kernel_size >= stride) {
+      func(kernel_size, kernel_size, stride, stride, VALID, -1);
+      func(kernel_size, kernel_size, stride, stride, SAME, -1);
+      func(kernel_size, kernel_size, stride, stride, VALID, 1);
+      func(kernel_size, kernel_size, stride, stride, VALID, 2);
+      func(kernel_size, kernel_size, stride, stride, VALID, 3);
+    }
  }
 }
 }  // namespace

+
 TEST_F(Deconv2dOpTest, OPENCLAlignedDeconvNxNS12) {
-  TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {32, 16, 16, 32}, 1);
-  TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {32, 16, 16, 32}, 2);
+  TestComplexDeconvNxN<DeviceType::GPU, float>(1, {32, 16, 16, 32}, 1);
+  TestComplexDeconvNxN<DeviceType::GPU, float>(1, {32, 16, 16, 32}, 2);
 }

 TEST_F(Deconv2dOpTest, OPENCLAlignedDeconvNxNS34) {
-  TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {32, 16, 16, 32}, 3);
-  TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {32, 16, 16, 32}, 4);
+  TestComplexDeconvNxN<DeviceType::GPU, float>(1, {32, 16, 16, 32}, 3);
+  TestComplexDeconvNxN<DeviceType::GPU, float>(1, {32, 16, 16, 32}, 4);
 }

 TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS12) {
-  TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 1);
-  TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 2);
+  TestComplexDeconvNxN<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 1);
+  TestComplexDeconvNxN<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 2);
 }

 TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS34) {
-  TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 3);
-  TestComplexDeconvNxNS12<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 4);
+  TestComplexDeconvNxN<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 3);
+  TestComplexDeconvNxN<DeviceType::GPU, float>(1, {17, 113, 5, 7}, 4);
 }

 TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNMultiBatch) {
-  TestComplexDeconvNxNS12<DeviceType::GPU, float>(3, {17, 113, 5, 7}, 1);
-  TestComplexDeconvNxNS12<DeviceType::GPU, float>(5, {17, 113, 5, 7}, 2);
+  TestComplexDeconvNxN<DeviceType::GPU, float>(3, {17, 113, 5, 7}, 1);
+  TestComplexDeconvNxN<DeviceType::GPU, float>(5, {17, 113, 5, 7}, 2);
 }

 }  // namespace test

--- a/mace/ops/depthwise_deconv2d.cc
+++ b/mace/ops/depthwise_deconv2d.cc
@@ -24,11 +24,8 @@
 #include <vector>

 #include "mace/core/future.h"
-#include "mace/core/operator.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/activation.h"
 #include "mace/ops/arm/depthwise_deconv2d_neon.h"
-#include "mace/ops/conv_pool_2d_util.h"
 #include "mace/utils/utils.h"
 #include "mace/public/mace.h"
 #ifdef MACE_ENABLE_OPENCL
@@ -38,90 +35,15 @@
 namespace mace {
 namespace ops {

-class DepthwiseDeconv2dOpBase : public Operation {
- public:
-  explicit DepthwiseDeconv2dOpBase(OpConstructContext *context)
-      : Operation(context),
-        strides_(Operation::GetRepeatedArgs<int>("strides")),
-        paddings_(Operation::GetRepeatedArgs<int>("padding_values")),
-        group_(Operation::GetOptionalArg<int>("group", 1)),
-        activation_(ops::StringToActivationType(
-            Operation::GetOptionalArg<std::string>("activation",
-                                                      "NOOP"))),
-        relux_max_limit_(Operation::GetOptionalArg<float>("max_limit",
-                                                             0.0f)) {}
-
-  static void CalcGroupDeconvOutputSize(
-      const index_t *input_shape,   // NHWC
-      const index_t *filter_shape,  // OIHW
-      const int group,
-      const int *strides,
-      const int *paddings,
-      int *pre_paddings,
-      index_t *out_shape,
-      index_t *padded_out_shape,
-      const bool isNCHW = false) {
-    MACE_CHECK_NOTNULL(paddings);
-    MACE_CHECK_NOTNULL(input_shape);
-    MACE_CHECK_NOTNULL(filter_shape);
-    MACE_CHECK_NOTNULL(strides);
-
-    const index_t in_height = isNCHW ? input_shape[2] : input_shape[1];
-    const index_t in_width = isNCHW ? input_shape[3] : input_shape[2];
-
-    const index_t output_channel = filter_shape[0] * group;
-
-    const index_t kernel_h = filter_shape[2];
-    const index_t kernel_w = filter_shape[3];
-
-    index_t padded_out_height =
-        (in_height - 1) * strides[0] + kernel_h;
-    index_t padded_out_width =
-        (in_width - 1) * strides[1] + kernel_w;
-
-    if (pre_paddings != nullptr) {
-      pre_paddings[0] = static_cast<int>((kernel_h - 1) * 2 - paddings[0]);
-      pre_paddings[1] = static_cast<int>((kernel_w - 1) * 2 - paddings[1]);
-      pre_paddings[0] = std::max<int>(0, pre_paddings[0]);
-      pre_paddings[1] = std::max<int>(0, pre_paddings[1]);
-    }
-
-    if (padded_out_shape != nullptr) {
-      padded_out_shape[0] = input_shape[0];
-      padded_out_shape[1] = isNCHW ? output_channel : padded_out_height;
-      padded_out_shape[2] = isNCHW ? padded_out_height : padded_out_width;
-      padded_out_shape[3] = isNCHW ? padded_out_width : output_channel;
-    }
-
-    if (out_shape != nullptr) {
-      index_t out_height = padded_out_height - paddings[0];
-      index_t out_width = padded_out_width - paddings[1];
-      out_shape[0] = input_shape[0];
-      out_shape[1] = isNCHW ? output_channel : out_height;
-      out_shape[2] = isNCHW ? out_height : out_width;
-      out_shape[3] = isNCHW ? out_width : output_channel;
-    }
-  }
-
- protected:
-  std::vector<int> strides_;  // [stride_h, stride_w]
-  std::vector<int> paddings_;
-  const int group_;
-  const ActivationType activation_;
-  const float relux_max_limit_;
-};
-
-
-
 template <DeviceType D, class T>
 class DepthwiseDeconv2dOp;

 template<>
 class DepthwiseDeconv2dOp<DeviceType::CPU, float>
-    : public DepthwiseDeconv2dOpBase {
+    : public Deconv2dOpBase {
 public:
  explicit DepthwiseDeconv2dOp(OpConstructContext *context)
-      : DepthwiseDeconv2dOpBase(context) {}
+      : Deconv2dOpBase(context) {}

  MaceStatus Run(OpContext *context) override {
    const Tensor *input = this->Input(0);
@@ -138,15 +60,17 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
    std::vector<index_t> padded_out_shape(4, 0);

    if (!paddings_.empty()) out_paddings = paddings_;
-    CalcGroupDeconvOutputSize(input->shape().data(),
-                              filter->shape().data(),
-                              group_,
-                              strides_.data(),
-                              out_paddings.data(),
-                              nullptr,
-                              out_shape.data(),
-                              padded_out_shape.data(),
-                              true);
+    CalcDeconvShape_Caffe(
+        input->shape().data(),
+        filter->shape().data(),
+        strides_.data(),
+        out_paddings.data(),
+        group_,
+        nullptr,
+        out_shape.data(),
+        padded_out_shape.data(),
+        true);
+
    MACE_RETURN_IF_ERROR(output->Resize(out_shape));
    output->Clear();
    index_t kernel_h = filter->dim(2);
@@ -480,10 +404,10 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
-class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public DepthwiseDeconv2dOpBase {
+class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
 public:
  explicit DepthwiseDeconv2dOp(OpConstructContext *context)
-      : DepthwiseDeconv2dOpBase(context) {
+      : Deconv2dOpBase(context) {
    if (context->device()->opencl_runtime()->UseImageMemory()) {
      kernel_.reset(new opencl::image::DepthwiseDeconv2dKernel<T>);
    } else {
@@ -501,16 +425,18 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public DepthwiseDeconv2dOpBase {
    MACE_CHECK_NOTNULL(output);

    std::vector<int> in_paddings(2, 0);
+    std::vector<int> out_paddings(2, 0);
    std::vector<index_t> out_shape(4, 0);

-    CalcGroupDeconvOutputSize(input->shape().data(),
-                              filter->shape().data(),
-                              group_,
-                              strides_.data(),
-                              paddings_.data(),
-                              in_paddings.data(),
-                              out_shape.data(),
-                              nullptr);
+    if (!paddings_.empty()) out_paddings = paddings_;
+    CalcDeconvShape_Caffe(input->shape().data(),
+                          filter->shape().data(),
+                          strides_.data(),
+                          out_paddings.data(),
+                          group_,
+                          in_paddings.data(),
+                          out_shape.data(),
+                          nullptr);

    return kernel_->Compute(context,
                            input,