diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc
index d25f3fd1a0f1043bd663713fc176b0928392da60..cef7ddd5fe7e12a374fb9cc79211bd2eb97c6c52 100644
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -320,20 +320,20 @@ REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
             ops::ConvOpGrad);
 
 // depthwise convolution op
-REGISTER_OP(depthwise_conv, ops::ConvOp, ops::Conv2DOpMaker,
-            depthwise_conv_grad, ops::ConvOpGrad);
+REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+            depthwise_conv2d_grad, ops::ConvOpGrad);
 REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
             ops::ConvOpGrad);
 
 // depthwise conv kernel
 // TODO(xingzhaolong): neon kernel for mobile
 REGISTER_OP_CPU_KERNEL(
-    depthwise_conv,
+    depthwise_conv2d,
     ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
 
 REGISTER_OP_CPU_KERNEL(
-    depthwise_conv_grad,
+    depthwise_conv2d_grad,
     ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
 
diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc
index 02a4e52466fdd939593690c8077729d673ecfe51..d0bd40ee95dab3b2589742b8a0c3a5de7918b5b9 100644
--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
@@ -17,12 +17,12 @@ limitations under the License. */
 namespace ops = paddle::operators;
 
 REGISTER_OP_CUDA_KERNEL(
-    depthwise_conv,
+    depthwise_conv2d,
     ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
     ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
 
 REGISTER_OP_CUDA_KERNEL(
-    depthwise_conv_grad,
+    depthwise_conv2d_grad,
     ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
 
diff --git a/paddle/operators/math/depthwise_conv.cu b/paddle/operators/math/depthwise_conv.cu
index b9b958c92b0346c8fa320ffd45f77fa949b72716..b212e78208355866516211d276cb8046623babd7 100644
--- a/paddle/operators/math/depthwise_conv.cu
+++ b/paddle/operators/math/depthwise_conv.cu
@@ -42,38 +42,23 @@ __global__ void KernelDepthwiseConv(
     T value = 0;
     const int h_in_start = -padding_height + h_out * stride_height;
     const int w_in_start = -padding_width + w_out * stride_width;
-    const int h_in_end =
-        -padding_height + h_out * stride_height + filter_height - 1;
-    const int w_in_end =
-        -padding_width + w_out * stride_width + filter_width - 1;
+    const int h_in_end = h_in_start + filter_height;
+    const int w_in_end = w_in_start + filter_width;
 
     const int in_offset =
         ((batch * input_channels + c_in) * input_height) * input_width;
 
-    if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
-        (w_in_end < input_width)) {
-      for (int kh = 0; kh < filter_height; ++kh) {
-        for (int kw = 0; kw < filter_width; ++kw) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          const int offset = in_offset + h_in * input_width + w_in;
-
-          value += (*weight) * input_data[offset];
-          ++weight;
-        }
-      }
-    } else {
-      for (int kh = 0; kh < filter_height; ++kh) {
-        for (int kw = 0; kw < filter_width; ++kw) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
-              (w_in < input_width)) {
-            const int offset = in_offset + h_in * input_width + w_in;
-            value += (*weight) * input_data[offset];
-          }
-          ++weight;
-        }
+    const int h_end = h_in_end < input_height ? h_in_end : input_height;
+    const int w_end = w_in_end < input_width ? w_in_end : input_width;
+    const int h_start = h_in_start > 0 ? h_in_start : 0;
+    const int w_start = w_in_start > 0 ? w_in_start : 0;
+
+    for (int h_in = h_start; h_in < h_end; h_in++) {
+      for (int w_in = w_start; w_in < w_end; w_in++) {
+        const int offset = in_offset + h_in * input_width + w_in;
+        value +=
+            weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] *
+            input_data[offset];
       }
     }
     output_data[index] = value;
@@ -162,32 +147,18 @@ __global__ void KernelDepthwiseConvFilterGrad(
         (batch * input_channels + c_in) * input_height * input_width;
 
     T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
-
-    if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
-        (w_in_end < input_width)) {
-      for (int kw = 0; kw < filter_width; kw++) {
-        for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          const int offset = in_offset + h_in * input_width + w_in;
-          const T diff_temp = output_grad_data[index] * input_data[offset];
-          T* addr = addr_offset + kh * filter_width + kw;
-          paddle::platform::CudaAtomicAdd(addr, diff_temp);
-        }
-      }
-    } else {
-      for (int kw = 0; kw < filter_width; kw++) {
-        for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
-              (w_in < input_width)) {
-            const int offset = in_offset + h_in * input_width + w_in;
-            const T diff_temp = output_grad_data[index] * input_data[offset];
-            T* addr = addr_offset + kh * filter_width + kw;
-            paddle::platform::CudaAtomicAdd(addr, diff_temp);
-          }
-        }
+    const int h_end = h_in_end < input_height ? h_in_end : input_height;
+    const int w_end = w_in_end < input_width ? w_in_end : input_width;
+    const int h_start = h_in_start > 0 ? h_in_start : 0;
+    const int w_start = w_in_start > 0 ? w_in_start : 0;
+
+    for (int h_in = h_start; h_in < h_end; h_in++) {
+      for (int w_in = w_start; w_in < w_end; w_in++) {
+        const int offset = in_offset + h_in * input_width + w_in;
+        const T diff_temp = output_grad_data[index] * input_data[offset];
+        T* addr = addr_offset + (h_in - h_in_start) * filter_width +
+                  (w_in - w_in_start);
+        paddle::platform::CudaAtomicAdd(addr, diff_temp);
       }
     }
   }
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 4be6ae8ed69c25dbe7c5ecb9affd6ec948362278..aaf096f0dd5ed9b2dc6004d39e337b3efe879da0 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -1237,7 +1237,7 @@ def conv2d(input,
     l_type = 'conv2d'
     if (num_channels == groups and num_filters % num_channels == 0 and
             not use_cudnn):
-        l_type = 'depthwise_conv'
+        l_type = 'depthwise_conv2d'
 
     helper = LayerHelper(l_type, **locals())
     dtype = helper.input_dtype()
diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py
index a034d0ab918355859b3a34f9809cb1103b390e0d..7512ea333e37d5f4f0102531d8d13f8c2a744b8d 100644
--- a/python/paddle/v2/fluid/tests/test_conv2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py
@@ -250,7 +250,7 @@ class TestDepthwiseConv(TestConv2dOp):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] / self.groups
         self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv"
+        self.op_type = "depthwise_conv2d"
 
 
 class TestDepthwiseConv2(TestConv2dOp):
@@ -262,7 +262,7 @@ class TestDepthwiseConv2(TestConv2dOp):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] / self.groups
         self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv"
+        self.op_type = "depthwise_conv2d"
 
 
 #  cudnn v5 does not support dilation conv.