From 2ffa3a8bf6a7cb0e3d5e1ac211417c234ab04f04 Mon Sep 17 00:00:00 2001
From: xzl
Date: Fri, 2 Feb 2018 18:28:23 +0800
Subject: [PATCH] rename op to depthwise_conv2d and make the CUDA kernels more
 efficient

---
 paddle/operators/conv_op.cc                 |  8 +-
 paddle/operators/conv_op.cu.cc              |  4 +-
 paddle/operators/math/depthwise_conv.cu     | 79 ++++++-------------
 python/paddle/v2/fluid/layers/nn.py         |  2 +-
 .../paddle/v2/fluid/tests/test_conv2d_op.py |  4 +-
 5 files changed, 34 insertions(+), 63 deletions(-)

diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc
index d25f3fd1a0f..cef7ddd5fe7 100644
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -320,20 +320,20 @@ REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
             ops::ConvOpGrad);
 
 // depthwise convolution op
-REGISTER_OP(depthwise_conv, ops::ConvOp, ops::Conv2DOpMaker,
-            depthwise_conv_grad, ops::ConvOpGrad);
+REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+            depthwise_conv2d_grad, ops::ConvOpGrad);
 REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
             ops::ConvOpGrad);
 
 // depthwise conv kernel
 // TODO(xingzhaolong): neon kernel for mobile
 REGISTER_OP_CPU_KERNEL(
-    depthwise_conv,
+    depthwise_conv2d,
     ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
 
 REGISTER_OP_CPU_KERNEL(
-    depthwise_conv_grad,
+    depthwise_conv2d_grad,
     ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc
index 02a4e52466f..d0bd40ee95d 100644
--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
@@ -17,12 +17,12 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 
 REGISTER_OP_CUDA_KERNEL(
-    depthwise_conv,
+    depthwise_conv2d,
     ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
     ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
 
 REGISTER_OP_CUDA_KERNEL(
-    depthwise_conv_grad,
+    depthwise_conv2d_grad,
     ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/math/depthwise_conv.cu b/paddle/operators/math/depthwise_conv.cu
index b9b958c92b0..b212e782083 100644
--- a/paddle/operators/math/depthwise_conv.cu
+++ b/paddle/operators/math/depthwise_conv.cu
@@ -42,38 +42,23 @@ __global__ void KernelDepthwiseConv(
     T value = 0;
     const int h_in_start = -padding_height + h_out * stride_height;
     const int w_in_start = -padding_width + w_out * stride_width;
-    const int h_in_end =
-        -padding_height + h_out * stride_height + filter_height - 1;
-    const int w_in_end =
-        -padding_width + w_out * stride_width + filter_width - 1;
+    const int h_in_end = h_in_start + filter_height;
+    const int w_in_end = w_in_start + filter_width;
     const int in_offset =
         ((batch * input_channels + c_in) * input_height) * input_width;
-    if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
-        (w_in_end < input_width)) {
-      for (int kh = 0; kh < filter_height; ++kh) {
-        for (int kw = 0; kw < filter_width; ++kw) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          const int offset = in_offset + h_in * input_width + w_in;
-
-          value += (*weight) * input_data[offset];
-          ++weight;
-        }
-      }
-    } else {
-      for (int kh = 0; kh < filter_height; ++kh) {
-        for (int kw = 0; kw < filter_width; ++kw) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
-              (w_in < input_width)) {
-            const int offset = in_offset + h_in * input_width + w_in;
-            value += (*weight) * input_data[offset];
-          }
-          ++weight;
-        }
+    const int h_end = h_in_end < input_height ? h_in_end : input_height;
+    const int w_end = w_in_end < input_width ? w_in_end : input_width;
+    const int h_start = h_in_start > 0 ? h_in_start : 0;
+    const int w_start = w_in_start > 0 ? w_in_start : 0;
+
+    for (int h_in = h_start; h_in < h_end; h_in++) {
+      for (int w_in = w_start; w_in < w_end; w_in++) {
+        const int offset = in_offset + h_in * input_width + w_in;
+        value +=
+            weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] *
+            input_data[offset];
       }
     }
     output_data[index] = value;
@@ -162,32 +147,18 @@ __global__ void KernelDepthwiseConvFilterGrad(
         (batch * input_channels + c_in) * input_height * input_width;
     T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
-
-    if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
-        (w_in_end < input_width)) {
-      for (int kw = 0; kw < filter_width; kw++) {
-        for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          const int offset = in_offset + h_in * input_width + w_in;
-          const T diff_temp = output_grad_data[index] * input_data[offset];
-          T* addr = addr_offset + kh * filter_width + kw;
-          paddle::platform::CudaAtomicAdd(addr, diff_temp);
-        }
-      }
-    } else {
-      for (int kw = 0; kw < filter_width; kw++) {
-        for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
-              (w_in < input_width)) {
-            const int offset = in_offset + h_in * input_width + w_in;
-            const T diff_temp = output_grad_data[index] * input_data[offset];
-            T* addr = addr_offset + kh * filter_width + kw;
-            paddle::platform::CudaAtomicAdd(addr, diff_temp);
-          }
-        }
+    const int h_end = h_in_end < input_height ? h_in_end : input_height;
+    const int w_end = w_in_end < input_width ? w_in_end : input_width;
+    const int h_start = h_in_start > 0 ? h_in_start : 0;
+    const int w_start = w_in_start > 0 ? w_in_start : 0;
+
+    for (int h_in = h_start; h_in < h_end; h_in++) {
+      for (int w_in = w_start; w_in < w_end; w_in++) {
+        const int offset = in_offset + h_in * input_width + w_in;
+        const T diff_temp = output_grad_data[index] * input_data[offset];
+        T* addr = addr_offset + (h_in - h_in_start) * filter_width +
+                  (w_in - w_in_start);
+        paddle::platform::CudaAtomicAdd(addr, diff_temp);
       }
     }
   }
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 4be6ae8ed69..aaf096f0dd5 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -1237,7 +1237,7 @@ def conv2d(input,
     l_type = 'conv2d'
     if (num_channels == groups and num_filters % num_channels == 0 and
             not use_cudnn):
-        l_type = 'depthwise_conv'
+        l_type = 'depthwise_conv2d'
 
     helper = LayerHelper(l_type, **locals())
     dtype = helper.input_dtype()
diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py
index a034d0ab918..7512ea333e3 100644
--- a/python/paddle/v2/fluid/tests/test_conv2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py
@@ -250,7 +250,7 @@ class TestDepthwiseConv(TestConv2dOp):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] / self.groups
         self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv"
+        self.op_type = "depthwise_conv2d"
 
 
 class TestDepthwiseConv2(TestConv2dOp):
@@ -262,7 +262,7 @@ class TestDepthwiseConv2(TestConv2dOp):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] / self.groups
         self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv"
+        self.op_type = "depthwise_conv2d"
 
 
 # cudnn v5 does not support dilation conv.
--
GitLab
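
The efficiency change is the same in both CUDA kernels: the old code kept two copies of the filter loop nest, an all-in-bounds fast path and a slow path that bounds-checked every tap, while the new code clamps the loop bounds to the image once per output element and then walks only the valid input pixels, recovering the filter index as (h_in - h_in_start) * filter_width + (w_in - w_in_start). Out-of-bounds taps contributed nothing in the old slow path, so skipping them is equivalent. Below is a minimal CPU-side sketch of the idea; the names mirror the kernel, but the standalone function and its signature are illustrative, not part of the patch:

    #include <algorithm>
    #include <vector>

    // One output element of a depthwise convolution: dot the filter against
    // the input window starting at (h_in_start, w_in_start), which may hang
    // off the image when padding is in effect.
    float DepthwiseWindowDot(const std::vector<float>& input,
                             const std::vector<float>& weight,
                             int input_height, int input_width,
                             int filter_height, int filter_width,
                             int h_in_start, int w_in_start) {
      // Clamp the window to the image once; the loops below never branch.
      const int h_start = std::max(h_in_start, 0);
      const int w_start = std::max(w_in_start, 0);
      const int h_end = std::min(h_in_start + filter_height, input_height);
      const int w_end = std::min(w_in_start + filter_width, input_width);
      float value = 0;
      for (int h_in = h_start; h_in < h_end; ++h_in) {
        for (int w_in = w_start; w_in < w_end; ++w_in) {
          // Subtracting the window origin recovers the filter tap index.
          value += weight[(h_in - h_in_start) * filter_width +
                          (w_in - w_in_start)] *
                   input[h_in * input_width + w_in];
        }
      }
      return value;
    }

The filter-gradient kernel uses the same clamped loops, with the recovered filter index addressing the CudaAtomicAdd target instead of the weight read. On the Python side nothing changes for callers: fluid.layers.conv2d still routes to the depthwise op when num_channels == groups, num_filters is a multiple of num_channels, and use_cudnn is False; only the op name it emits becomes depthwise_conv2d.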