From 6e17babe49a7fdeb4f345c83d347f217d05e7e77 Mon Sep 17 00:00:00 2001
From: xzl
Date: Tue, 30 Jan 2018 19:05:53 +0800
Subject: [PATCH] More efficient, add check on python side

---
 paddle/operators/CMakeLists.txt         |  1 -
 paddle/operators/math/depthwise_conv.cu | 52 ++++++++++++-------------
 python/paddle/v2/fluid/layers/nn.py     |  3 +-
 3 files changed, 26 insertions(+), 30 deletions(-)

diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 8b442af45b6..f7d600414fc 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -159,7 +159,6 @@ if (WITH_GPU)
     op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc
         DEPS vol2col depthwise_conv)
-#   op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col)
     op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function)
     op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
     op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc
diff --git a/paddle/operators/math/depthwise_conv.cu b/paddle/operators/math/depthwise_conv.cu
index 23e26e88275..4aa38151e66 100644
--- a/paddle/operators/math/depthwise_conv.cu
+++ b/paddle/operators/math/depthwise_conv.cu
@@ -46,16 +46,18 @@ __global__ void KernelDepthwiseConv(
         -padding_height + h_out * stride_height + filter_height - 1;
     const int w_in_end =
         -padding_width + w_out * stride_width + filter_width - 1;
+
+    const int in_offset =
+        ((batch * input_channels + c_in) * input_height) * input_width;
+
     if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
         (w_in_end < input_width)) {
       for (int kh = 0; kh < filter_height; ++kh) {
         for (int kw = 0; kw < filter_width; ++kw) {
-          const int h_in = -padding_height + h_out * stride_height + kh;
-          const int w_in = -padding_width + w_out * stride_width + kw;
-          const int offset =
-              ((batch * input_channels + c_in) * input_height + h_in) *
-                  input_width +
-              w_in;
+          const int h_in = h_in_start + kh;
+          const int w_in = w_in_start + kw;
+          const int offset = in_offset + h_in * input_width + w_in;
+
           value += (*weight) * input_data[offset];
           ++weight;
         }
@@ -63,14 +65,11 @@ __global__ void KernelDepthwiseConv(
     } else {
       for (int kh = 0; kh < filter_height; ++kh) {
         for (int kw = 0; kw < filter_width; ++kw) {
-          const int h_in = -padding_height + h_out * stride_height + kh;
-          const int w_in = -padding_width + w_out * stride_width + kw;
+          const int h_in = h_in_start + kh;
+          const int w_in = w_in_start + kw;
           if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
               (w_in < input_width)) {
-            const int offset =
-                ((batch * input_channels + c_in) * input_height + h_in) *
-                    input_width +
-                w_in;
+            const int offset = in_offset + h_in * input_width + w_in;
             value += (*weight) * input_data[offset];
           }
           ++weight;
@@ -159,36 +158,33 @@ __global__ void KernelDepthwiseConvFilterGrad(
     const int h_in_end =
         -padding_height + h_out * stride_height + filter_height;
     const int w_in_end = -padding_width + w_out * stride_width + filter_width;
+    const int in_offset =
+        (batch * input_channels + c_in) * input_height * input_width;
+
+    T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
+
     if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
         (w_in_end < input_width)) {
       for (int kw = 0; kw < filter_width; kw++) {
         for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = -padding_height + h_out * stride_height + kh;
-          const int w_in = -padding_width + w_out * stride_width + kw;
-          const int offset =
-              ((batch * input_channels + c_in) * input_height + h_in) *
-                  input_width +
-              w_in;
+          const int h_in = h_in_start + kh;
+          const int w_in = w_in_start + kw;
+          const int offset = in_offset + h_in * input_width + w_in;
           const T diff_temp = output_grad_data[index] * input_data[offset];
-          T* addr = filter_grad_data + c_out * filter_height * filter_width +
-                    kh * filter_width + kw;
+          T* addr = addr_offset + kh * filter_width + kw;
           paddle::platform::CudaAtomicAdd(addr, diff_temp);
         }
       }
     } else {
       for (int kw = 0; kw < filter_width; kw++) {
         for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = -padding_height + h_out * stride_height + kh;
-          const int w_in = -padding_width + w_out * stride_width + kw;
+          const int h_in = h_in_start + kh;
+          const int w_in = w_in_start + kw;
           if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
               (w_in < input_width)) {
-            const int offset =
-                ((batch * input_channels + c_in) * input_height + h_in) *
-                    input_width +
-                w_in;
+            const int offset = in_offset + h_in * input_width + w_in;
             const T diff_temp = output_grad_data[index] * input_data[offset];
-            T* addr = filter_grad_data + c_out * filter_height * filter_width +
-                      kh * filter_width + kw;
+            T* addr = addr_offset + kh * filter_width + kw;
             paddle::platform::CudaAtomicAdd(addr, diff_temp);
           }
         }
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 40c7ec5866b..a047cc4eecb 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -1013,7 +1013,8 @@ def conv2d(input,
     num_channels = input.shape[1]
 
     l_type = 'conv2d'
-    if num_channels == groups and not use_cudnn:
+    if (num_channels == groups and num_filters % num_channels == 0 and
+            not use_cudnn):
        l_type = 'depthwise_conv'
 
     helper = LayerHelper(l_type, **locals())
--
GitLab
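
Note on the kernel change: it is purely index-math hoisting. The term (batch * input_channels + c_in) * input_height * input_width does not depend on the filter loop variables kh/kw, so it is computed once per output element as in_offset rather than on every filter tap; the filter-gradient kernel likewise hoists the filter_grad_data + c_out * filter_height * filter_width base pointer into addr_offset. The sketch below shows the same pattern in isolation. It is not the Paddle kernel: the name is hypothetical and it assumes a single image, stride 1, no padding, and one filter per channel; only the hoisted-offset structure mirrors the patch.

#include <cuda_runtime.h>

// Illustrative sketch only (not the Paddle kernel): depthwise convolution for
// one image, stride 1, no padding, one filter per channel. Shows the hoisting
// used in the patch: the per-channel input offset and filter pointer are
// computed once per output element, and only the spatial part of the index is
// recomputed inside the filter loops.
template <typename T>
__global__ void DepthwiseConvNoPadSketch(const T* input, const T* filter,
                                         T* output, int channels,
                                         int input_height, int input_width,
                                         int output_height, int output_width,
                                         int filter_height, int filter_width) {
  const int index = blockIdx.x * blockDim.x + threadIdx.x;
  const int num_outputs = channels * output_height * output_width;
  if (index >= num_outputs) return;

  const int c = index / (output_height * output_width);
  const int h_out = (index / output_width) % output_height;
  const int w_out = index % output_width;

  // Loop-invariant parts, hoisted out of the kh/kw loops.
  const int in_offset = c * input_height * input_width;
  const T* weight = filter + c * filter_height * filter_width;

  T value = 0;
  for (int kh = 0; kh < filter_height; ++kh) {
    for (int kw = 0; kw < filter_width; ++kw) {
      const int h_in = h_out + kh;  // stride 1, no padding in this sketch
      const int w_in = w_out + kw;
      value += (*weight) * input[in_offset + h_in * input_width + w_in];
      ++weight;
    }
  }
  output[index] = value;
}

On the Python side, conv2d now dispatches to the depthwise_conv op only when num_channels == groups, num_filters is a multiple of num_channels, and cuDNN is not requested; otherwise it falls back to the generic conv2d op.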