diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp
index bd9a56a8a520fe0caadaf161ecd8295653ece826..18126152ea0b4ebfe4ec5c8084479787814ed173 100644
--- a/paddle/function/neon/NeonDepthwiseConv.cpp
+++ b/paddle/function/neon/NeonDepthwiseConv.cpp
@@ -64,9 +64,10 @@ public:
 
     // padding the input
     float* inputPadding = inputData;
+    int padInputHeight = inputHeight + 2 * paddingH();
+    int padInputWidth = inputWidth + 2 * paddingW();
     if (paddingH() > 0 || paddingW() > 0) {
-      int newSize = batchSize * inputChannels * (inputHeight + 2 * paddingH()) *
-                    (inputWidth + 2 * paddingW());
+      int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
       resizeBuffer<Device>(newSize);
       inputPadding = reinterpret_cast<float*>(memory_->getBuf());
       neon::Padding<float>::run(inputData,
@@ -74,12 +75,8 @@ public:
                                 batchSize * inputChannels,
                                 inputHeight,
                                 inputWidth,
-                                paddingH(),
-                                paddingW());
-
-      // height and width of padding data
-      inputHeight += 2 * paddingH();
-      inputWidth += 2 * paddingW();
+                                padInputHeight,
+                                padInputWidth);
     }
 
     std::function<void(
@@ -101,14 +98,14 @@ public:
     for (int i = 0; i < batchSize; i++) {
       DepthWiseConv(inputPadding,
                     filterData,
-                    inputHeight,
-                    inputWidth,
+                    padInputHeight,
+                    padInputWidth,
                     outputChannels,
                     outputHeight,
                     outputWidth,
                     filterMultiplier,
                     outputData);
-      inputPadding += inputChannels * inputHeight * inputWidth;
+      inputPadding += inputChannels * padInputHeight * padInputWidth;
       outputData += outputChannels * outputHeight * outputWidth;
     }
   }
diff --git a/paddle/function/neon/NeonDepthwiseConv.h b/paddle/function/neon/NeonDepthwiseConv.h
index 3ceaa65ddb9027563a1ec9cea68cf8175fa30769..30f0158c61e4753cb87f45453199053c58b91673 100644
--- a/paddle/function/neon/NeonDepthwiseConv.h
+++ b/paddle/function/neon/NeonDepthwiseConv.h
@@ -477,39 +477,40 @@ struct DepthwiseConvKernel<4, 2> {
 
 template <class T>
 struct Padding {
-  static void run(const T* src,
-                  T* dest,
+  static void run(const T* input,
+                  T* inputPadding,
                   int channels,
                   int inputHeight,
                   int inputWidth,
-                  int paddingHeight,
-                  int paddingWidth) {
-    const int destWidth = inputWidth + 2 * paddingWidth;
+                  int padInputHeight,
+                  int padInputWidth) {
+    const int paddingHeight = (padInputHeight - inputHeight) / 2;
+    const int paddingWidth = (padInputWidth - inputWidth) / 2;
     for (int c = 0; c < channels; c++) {
       if (paddingHeight > 0) {
-        memset(dest, 0, destWidth * paddingHeight * sizeof(T));
-        dest += destWidth * paddingHeight;
+        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(T));
+        inputPadding += padInputWidth * paddingHeight;
       }
 
       for (int i = 0; i < inputHeight; i++) {
         // padding head
         for (int j = 0; j < paddingWidth; j++) {
-          *dest++ = T(0);
+          *inputPadding++ = T(0);
         }
 
-        memcpy(dest, src, inputWidth * sizeof(T));
-        dest += inputWidth;
-        src += inputWidth;
+        memcpy(inputPadding, input, inputWidth * sizeof(T));
+        inputPadding += inputWidth;
+        input += inputWidth;
 
         // padding tail
         for (int j = 0; j < paddingWidth; j++) {
-          *dest++ = T(0);
+          *inputPadding++ = T(0);
         }
       }
 
       if (paddingHeight > 0) {
-        memset(dest, 0, destWidth * paddingHeight * sizeof(T));
-        dest += destWidth * paddingHeight;
+        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(T));
+        inputPadding += padInputWidth * paddingHeight;
       }
     }
   }
@@ -518,47 +519,48 @@ struct Padding {
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 template <>
 struct Padding<float> {
-  static void run(const float* src,
-                  float* dest,
+  static void run(const float* input,
+                  float* inputPadding,
                   int channels,
                   int inputHeight,
                   int inputWidth,
-                  int paddingHeight,
-                  int paddingWidth) {
-    const int destWidth = inputWidth + 2 * paddingWidth;
+                  int padInputHeight,
+                  int padInputWidth) {
+    const int paddingHeight = (padInputHeight - inputHeight) / 2;
+    const int paddingWidth = (padInputWidth - inputWidth) / 2;
     for (int c = 0; c < channels; c++) {
       if (paddingHeight > 0) {
-        memset(dest, 0, destWidth * paddingHeight * sizeof(float));
-        dest += destWidth * paddingHeight;
+        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
+        inputPadding += padInputWidth * paddingHeight;
       }
 
       for (int i = 0; i < inputHeight; i++) {
         // padding head
         for (int j = 0; j < paddingWidth; j++) {
-          *dest++ = float(0);
+          *inputPadding++ = float(0);
         }
 
         int step = inputWidth >> 2;
         int remain = inputWidth & 3;
         for (int s = 0; s < step; s++) {
-          float32x4_t s0 = vld1q_f32(src);
-          vst1q_f32(dest, s0);
-          src += 4;
-          dest += 4;
+          float32x4_t s0 = vld1q_f32(input);
+          vst1q_f32(inputPadding, s0);
+          input += 4;
+          inputPadding += 4;
         }
         for (int r = 0; r < remain; r++) {
-          *dest++ = *src++;
+          *inputPadding++ = *input++;
         }
 
         // padding tail
         for (int j = 0; j < paddingWidth; j++) {
-          *dest++ = float(0);
+          *inputPadding++ = float(0);
         }
       }
 
       if (paddingHeight > 0) {
-        memset(dest, 0, destWidth * paddingHeight * sizeof(float));
-        dest += destWidth * paddingHeight;
+        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
+        inputPadding += padInputWidth * paddingHeight;
       }
     }
   }
diff --git a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..03d571ecfe979ed5386cf8f857c5694c57843a12
--- /dev/null
+++ b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp
@@ -0,0 +1,124 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "NeonDepthwiseConv.h"
+#include "paddle/function/ConvOp.h"
+
+namespace paddle {
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+template <DeviceType Device>
+class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    int batchSize = input[0];
+    int inputChannels = input[1];
+    int inputHeight = input[2];
+    int inputWidth = input[3];
+    int filterHeight = getFilterHeight(filter);
+    int filterWidth = getFilterWidth(filter);
+    int outputChannels = output[1];
+    int outputHeight = output[2];
+    int outputWidth = output[3];
+    int filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(inputChannels, groups_);
+
+    // only support strideH() == strideW() and filterHeight == filterWidth.
+    CHECK_EQ(strideH(), strideW());
+    CHECK_EQ(paddingH(), paddingW());
+    CHECK_EQ(filterHeight, filterWidth);
+
+    float* inputData = inputs[0].data<float>();
+    float* filterData = inputs[1].data<float>();
+    float* outputData = outputs[0].data<float>();
+
+    // padding the input, input -> inputPadding
+    float* inputPadding = inputData;
+    int padInputHeight =
+        (inputHeight - 1) * strideH() + 2 * filterHeight - 1 - 2 * paddingH();
+    int padInputWidth =
+        (inputWidth - 1) * strideW() + 2 * filterWidth - 1 - 2 * paddingW();
+
+    if (padInputHeight > inputHeight || padInputWidth > inputWidth) {
+      int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
+      resizeBuffer<Device>(newSize);
+      inputPadding = reinterpret_cast<float*>(memory_->getBuf());
+      neon::Padding<float>::run(inputData,
+                                inputPadding,
+                                batchSize * inputChannels,
+                                inputHeight,
+                                inputWidth,
+                                padInputHeight,
+                                padInputWidth);
+    }
+
+    std::function<void(
+        const float*, const float*, int, int, int, int, int, int, float*)>
+        DepthWiseConv;
+
+    if (filterWidth == 3) {
+      DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run;
+    } else if (filterWidth == 4) {
+      DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run;
+    } else {
+      LOG(FATAL) << "Not supported";
+    }
+
+    for (int i = 0; i < batchSize; i++) {
+      DepthWiseConv(inputPadding,
+                    filterData,
+                    padInputHeight,
+                    padInputWidth,
+                    outputChannels,
+                    outputHeight,
+                    outputWidth,
+                    filterMultiplier,
+                    outputData);
+      inputPadding += inputChannels * padInputHeight * padInputWidth;
+      outputData += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
+#ifndef PADDLE_TYPE_DOUBLE
+
+REGISTER_TYPED_FUNC(NeonDepthwiseConvTranspose,
+                    CPU,
+                    NeonDepthwiseConvTransposeFunction);
+
+#endif
+
+#endif
+
+}  // namespace paddle