rename op to depthwise_conv2d, more efficient

2ffa3a8b · xzl · fc9b2b9a · 2ffa3a8b · 2ffa3a8b · 2ffa3a8b
5 changed file
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -320,20 +320,20 @@ REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
            ops::ConvOpGrad);
 // depthwise convolution op
-REGISTER_OP(depthwise_conv, ops::ConvOp, ops::Conv2DOpMaker,
+REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
-            depthwise_conv_grad, ops::ConvOpGrad);
+            depthwise_conv2d_grad, ops::ConvOpGrad);
 REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
            ops::ConvOpGrad);
 // depthwise conv kernel
 // TODO(xingzhaolong): neon kernel for mobile
 REGISTER_OP_CPU_KERNEL(
-    depthwise_conv,
+    depthwise_conv2d,
    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
-    depthwise_conv_grad,
+    depthwise_conv2d_grad,
    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);

--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
@@ -17,12 +17,12 @@ limitations under the License. */
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    depthwise_conv,
+    depthwise_conv2d,
    ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
    ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
-    depthwise_conv_grad,
+    depthwise_conv2d_grad,
    ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);

--- a/paddle/operators/math/depthwise_conv.cu
+++ b/paddle/operators/math/depthwise_conv.cu
@@ -42,38 +42,23 @@ __global__ void KernelDepthwiseConv(
    T value = 0;
    const int h_in_start = -padding_height + h_out * stride_height;
    const int w_in_start = -padding_width + w_out * stride_width;
-    const int h_in_end =
+    const int h_in_end = h_in_start + filter_height;
-        -padding_height + h_out * stride_height + filter_height - 1;
+    const int w_in_end = w_in_start + filter_width;
-    const int w_in_end =
-        -padding_width + w_out * stride_width + filter_width - 1;
    const int in_offset =
        ((batch * input_channels + c_in) * input_height) * input_width;
-    if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
+    const int h_end = h_in_end < input_height ? h_in_end : input_height;
-        (w_in_end < input_width)) {
+    const int w_end = w_in_end < input_width ? w_in_end : input_width;
-      for (int kh = 0; kh < filter_height; ++kh) {
+    const int h_start = h_in_start > 0 ? h_in_start : 0;
-        for (int kw = 0; kw < filter_width; ++kw) {
+    const int w_start = w_in_start > 0 ? w_in_start : 0;
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
+    for (int h_in = h_start; h_in < h_end; h_in++) {
-          const int offset = in_offset + h_in * input_width + w_in;
+      for (int w_in = w_start; w_in < w_end; w_in++) {
+        const int offset = in_offset + h_in * input_width + w_in;
-          value += (*weight) * input_data[offset];
+        value +=
-          ++weight;
+            weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] *
-        }
+            input_data[offset];
-      }
-    } else {
-      for (int kh = 0; kh < filter_height; ++kh) {
-        for (int kw = 0; kw < filter_width; ++kw) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
-              (w_in < input_width)) {
-            const int offset = in_offset + h_in * input_width + w_in;
-            value += (*weight) * input_data[offset];
-          }
-          ++weight;
-        }
      }
    }
    output_data[index] = value;
@@ -162,32 +147,18 @@ __global__ void KernelDepthwiseConvFilterGrad(
        (batch * input_channels + c_in) * input_height * input_width;
    T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
+    const int h_end = h_in_end < input_height ? h_in_end : input_height;
-    if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
+    const int w_end = w_in_end < input_width ? w_in_end : input_width;
-        (w_in_end < input_width)) {
+    const int h_start = h_in_start > 0 ? h_in_start : 0;
-      for (int kw = 0; kw < filter_width; kw++) {
+    const int w_start = w_in_start > 0 ? w_in_start : 0;
-        for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = h_in_start + kh;
+    for (int h_in = h_start; h_in < h_end; h_in++) {
-          const int w_in = w_in_start + kw;
+      for (int w_in = w_start; w_in < w_end; w_in++) {
-          const int offset = in_offset + h_in * input_width + w_in;
+        const int offset = in_offset + h_in * input_width + w_in;
-          const T diff_temp = output_grad_data[index] * input_data[offset];
+        const T diff_temp = output_grad_data[index] * input_data[offset];
-          T* addr = addr_offset + kh * filter_width + kw;
+        T* addr = addr_offset + (h_in - h_in_start) * filter_width +
-          paddle::platform::CudaAtomicAdd(addr, diff_temp);
+                  (w_in - w_in_start);
-        }
+        paddle::platform::CudaAtomicAdd(addr, diff_temp);
-      }
-    } else {
-      for (int kw = 0; kw < filter_width; kw++) {
-        for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
-              (w_in < input_width)) {
-            const int offset = in_offset + h_in * input_width + w_in;
-            const T diff_temp = output_grad_data[index] * input_data[offset];
-            T* addr = addr_offset + kh * filter_width + kw;
-            paddle::platform::CudaAtomicAdd(addr, diff_temp);
-          }
-        }
      }
    }
  }

--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -1237,7 +1237,7 @@ def conv2d(input,
    l_type = 'conv2d'
    if (num_channels == groups and num_filters % num_channels == 0 and
            not use_cudnn):
-        l_type = 'depthwise_conv'
+        l_type = 'depthwise_conv2d'
    helper = LayerHelper(l_type, **locals())
    dtype = helper.input_dtype()

--- a/python/paddle/v2/fluid/tests/test_conv2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py
@@ -250,7 +250,7 @@ class TestDepthwiseConv(TestConv2dOp):
        assert np.mod(self.input_size[1], self.groups) == 0
        f_c = self.input_size[1] / self.groups
        self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv"
+        self.op_type = "depthwise_conv2d"
 class TestDepthwiseConv2(TestConv2dOp):
@@ -262,7 +262,7 @@ class TestDepthwiseConv2(TestConv2dOp):
        assert np.mod(self.input_size[1], self.groups) == 0
        f_c = self.input_size[1] / self.groups
        self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv"
+        self.op_type = "depthwise_conv2d"
 #  cudnn v5 does not support dilation conv.