Make depthwise conv kernels more efficient; add a check on the Python side

6e17babe · xzl · b5ea0483 · 6e17babe · 6e17babe · 6e17babe
3 changed files
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -159,7 +159,6 @@ if (WITH_GPU)
 op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS
    vol2col depthwise_conv)

-# op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col)
 op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function)
 op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
 op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc

--- a/paddle/operators/math/depthwise_conv.cu
+++ b/paddle/operators/math/depthwise_conv.cu
@@ -46,16 +46,18 @@ __global__ void KernelDepthwiseConv(
        -padding_height + h_out * stride_height + filter_height - 1;
    const int w_in_end =
        -padding_width + w_out * stride_width + filter_width - 1;
+
+    const int in_offset =
+        ((batch * input_channels + c_in) * input_height) * input_width;
+
    if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
        (w_in_end < input_width)) {
      for (int kh = 0; kh < filter_height; ++kh) {
        for (int kw = 0; kw < filter_width; ++kw) {
-          const int h_in = -padding_height + h_out * stride_height + kh;
-          const int w_in = -padding_width + w_out * stride_width + kw;
-          const int offset =
-              ((batch * input_channels + c_in) * input_height + h_in) *
-                  input_width +
-              w_in;
+          const int h_in = h_in_start + kh;
+          const int w_in = w_in_start + kw;
+          const int offset = in_offset + h_in * input_width + w_in;
+
          value += (*weight) * input_data[offset];
          ++weight;
        }
@@ -63,14 +65,11 @@ __global__ void KernelDepthwiseConv(
    } else {
      for (int kh = 0; kh < filter_height; ++kh) {
        for (int kw = 0; kw < filter_width; ++kw) {
-          const int h_in = -padding_height + h_out * stride_height + kh;
-          const int w_in = -padding_width + w_out * stride_width + kw;
+          const int h_in = h_in_start + kh;
+          const int w_in = w_in_start + kw;
          if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
              (w_in < input_width)) {
-            const int offset =
-                ((batch * input_channels + c_in) * input_height + h_in) *
-                    input_width +
-                w_in;
+            const int offset = in_offset + h_in * input_width + w_in;
            value += (*weight) * input_data[offset];
          }
          ++weight;
@@ -159,36 +158,33 @@ __global__ void KernelDepthwiseConvFilterGrad(
    const int h_in_end =
        -padding_height + h_out * stride_height + filter_height;
    const int w_in_end = -padding_width + w_out * stride_width + filter_width;
+    const int in_offset =
+        (batch * input_channels + c_in) * input_height * input_width;
+
+    T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
+
    if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
        (w_in_end < input_width)) {
      for (int kw = 0; kw < filter_width; kw++) {
        for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = -padding_height + h_out * stride_height + kh;
-          const int w_in = -padding_width + w_out * stride_width + kw;
-          const int offset =
-              ((batch * input_channels + c_in) * input_height + h_in) *
-                  input_width +
-              w_in;
+          const int h_in = h_in_start + kh;
+          const int w_in = w_in_start + kw;
+          const int offset = in_offset + h_in * input_width + w_in;
          const T diff_temp = output_grad_data[index] * input_data[offset];
-          T* addr = filter_grad_data + c_out * filter_height * filter_width +
-                    kh * filter_width + kw;
+          T* addr = addr_offset + kh * filter_width + kw;
          paddle::platform::CudaAtomicAdd(addr, diff_temp);
        }
      }
    } else {
      for (int kw = 0; kw < filter_width; kw++) {
        for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = -padding_height + h_out * stride_height + kh;
-          const int w_in = -padding_width + w_out * stride_width + kw;
+          const int h_in = h_in_start + kh;
+          const int w_in = w_in_start + kw;
          if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
              (w_in < input_width)) {
-            const int offset =
-                ((batch * input_channels + c_in) * input_height + h_in) *
-                    input_width +
-                w_in;
+            const int offset = in_offset + h_in * input_width + w_in;
            const T diff_temp = output_grad_data[index] * input_data[offset];
-            T* addr = filter_grad_data + c_out * filter_height * filter_width +
-                      kh * filter_width + kw;
+            T* addr = addr_offset + kh * filter_width + kw;
            paddle::platform::CudaAtomicAdd(addr, diff_temp);
          }
        }

--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -1013,7 +1013,8 @@ def conv2d(input,
    num_channels = input.shape[1]

    l_type = 'conv2d'
-    if num_channels == groups and not use_cudnn:
+    if (num_channels == groups and num_filters % num_channels == 0 and
+            not use_cudnn):
        l_type = 'depthwise_conv'

    helper = LayerHelper(l_type, **locals())