diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 1518a8a654cfb54376a49760dc5873733c916937..8330c2be74a02eee2edd2d5836ae3b15afba290c 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -37,6 +37,7 @@ if(WITH_GPU)
     add_simple_unittest(MulOpTest)
     add_simple_unittest(CosSimOpTest)
     add_simple_unittest(RowConvOpTest)
+    add_simple_unittest(DepthwiseConvOpTest)  # DepthwiseConvOpTest.cpp; appears to sit inside if(WITH_GPU) - confirm
 endif()
 
 add_simple_unittest(ConvOpTest)
diff --git a/paddle/function/DepthwiseConvOpTest.cpp b/paddle/function/DepthwiseConvOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6d0cc6f75dc63313a8e0a62db9e690b64ddd79c8
--- /dev/null
+++ b/paddle/function/DepthwiseConvOpTest.cpp
@@ -0,0 +1,208 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+#include "Function.h"
+#include "FunctionTest.h"
+
+namespace paddle {
+
+enum TestType {          // picks how a test case wires inputs and outputs
+  kForwardTest,          // == 0: input + filter in, output out
+  kBackwardInputTest,    // == 1: output + filter in, input out (ADD_TO)
+  kBackwardFilterTest,   // == 2: output + input in, filter out
+};
+
+template <DeviceType DType1, DeviceType DType2>
+class DepthwiseConvolutionTest {
+public:
+  // Hands conv1 and conv2 to Compare2Function and calls run() for every
+  // combination of the square input/filter sizes, strides and paddings below.
+  DepthwiseConvolutionTest(const std::string& conv1,
+                           const std::string& conv2,
+                           TestType type,
+                           std::string algo = "auto") {
+    for (size_t batch : {1, 32}) {
+      for (size_t inSize : {7, 14, 54}) {
+        for (size_t kSize : {1, 3, 5}) {
+          for (size_t channels : {64, 128}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t pad : {0, 1}) {
+                if (kSize <= pad) break;  // skip padding not below filter size
+                const size_t outChannels = channels;
+                const size_t outSize =
+                    (inSize - kSize + 2 * pad + stride) / stride;
+                VLOG(3) << " batchSize=" << batch
+                        << " inputChannels=" << channels
+                        << " inputHeight=" << inSize << " inputWidth=" << inSize
+                        << " outputChannels=" << outChannels
+                        << " filterHeight=" << kSize << " filterWidth=" << kSize
+                        << " outputHeight=" << outSize
+                        << " outputWidth=" << outSize << " stride=" << stride
+                        << " padding=" << pad;
+                // Configuration handed to both functions under comparison.
+                std::vector<size_t> paddings = {pad, pad};
+                std::vector<size_t> strides = {stride, stride};
+                size_t groups = channels;  // one group per input channel
+                Compare2Function<DType1, DType2> test(
+                    conv1, conv2,
+                    FuncConfig()
+                        .set("paddings", paddings)
+                        .set("strides", strides)
+                        .set("groups", groups)
+                        .set("algo", algo));
+                TensorShape input{batch, channels, inSize, inSize};
+                TensorShape filter{channels, 1, 1, kSize, kSize};
+                TensorShape output{batch, outChannels, outSize, outSize};
+                // The requested pass decides which tensors are inputs/outputs.
+                switch (type) {
+                  case kForwardTest:
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                    test.run();
+                    break;
+                  case kBackwardInputTest:
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO);
+                    test.run();
+                    break;
+                  case kBackwardFilterTest:
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.run();
+                    break;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+// Covers rectangular cases: the input and the filter each use a height that
+// differs from their width. Stride is fixed at 1 and padding at 0.
+template <DeviceType DType1, DeviceType DType2>
+class DepthwiseConvolutionTest2 {
+public:
+  DepthwiseConvolutionTest2(const std::string& conv1,
+                            const std::string& conv2,
+                            TestType type,
+                            std::string algo = "auto") {
+    const size_t stride = 1;
+    const size_t padding = 0;
+    for (size_t batch : {16}) {
+      for (size_t inH : {7, 31}) {
+        for (size_t inW : {10, 54}) {
+          for (size_t kH : {1, 5}) {
+            for (size_t kW : {3, 7}) {
+              for (size_t channels : {32}) {
+                const size_t outChannels = channels;
+                const size_t outH = (inH - kH + 2 * padding + stride) / stride;
+                const size_t outW = (inW - kW + 2 * padding + stride) / stride;
+                VLOG(3) << " batchSize=" << batch
+                        << " inputChannels=" << channels
+                        << " inputHeight=" << inH
+                        << " inputWidth=" << inW
+                        << " outputChannels=" << outChannels
+                        << " filterHeight=" << kH
+                        << " filterWidth=" << kW
+                        << " outputHeight=" << outH
+                        << " outputWidth=" << outW
+                        << " stride=" << stride << " padding=" << padding;
+
+                // Configuration handed to both functions under comparison.
+                std::vector<size_t> paddings = {padding, padding};
+                std::vector<size_t> strides = {stride, stride};
+                size_t groups = channels;  // one group per input channel
+                Compare2Function<DType1, DType2> test(
+                    conv1,
+                    conv2,
+                    FuncConfig()
+                        .set("paddings", paddings)
+                        .set("strides", strides)
+                        .set("groups", groups)
+                        .set("algo", algo));
+
+                TensorShape input{batch, channels, inH, inW};
+                TensorShape filter{channels, 1, 1, kH, kW};
+                TensorShape output{batch, outChannels, outH, outW};
+
+                // The requested pass decides which tensors are inputs/outputs.
+                switch (type) {
+                  case kForwardTest:
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                    test.run();
+                    break;
+                  case kBackwardInputTest:
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO);
+                    test.run();
+                    break;
+                  case kBackwardFilterTest:
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.run();
+                    break;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+#ifndef PADDLE_ONLY_CPU
+TEST(Forward, GEMM2) {  // GPU depthwise forward checked against CPU GEMM conv
+  DepthwiseConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
+      "GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest);
+  DepthwiseConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+      "GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest);
+}
+
+TEST(BackwardInput, GEMM) {  // GPU result checked against the CPU GEMM version
+  DepthwiseConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
+      "GemmConvGradInput-CPU",
+      "DepthwiseConvGradInput-GPU",
+      kBackwardInputTest);
+  DepthwiseConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+      "GemmConvGradInput-CPU",
+      "DepthwiseConvGradInput-GPU",
+      kBackwardInputTest);
+}
+
+TEST(BackwardFilter, GEMM) {  // GPU result checked against the CPU GEMM version
+  DepthwiseConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
+      "GemmConvGradFilter-CPU",
+      "DepthwiseConvGradFilter-GPU",
+      kBackwardFilterTest);
+  DepthwiseConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+      "GemmConvGradFilter-CPU",
+      "DepthwiseConvGradFilter-GPU",
+      kBackwardFilterTest);
+}
+#endif
+
+}  // namespace paddle