From 2ffa3a8bf6a7cb0e3d5e1ac211417c234ab04f04 Mon Sep 17 00:00:00 2001
From: xzl
Date: Fri, 2 Feb 2018 18:28:23 +0800
Subject: [PATCH] rename op to depthwise_conv2d and make the CUDA kernels more
 efficient

---
 paddle/operators/conv_op.cc                 |  8 +-
 paddle/operators/conv_op.cu.cc              |  4 +-
 paddle/operators/math/depthwise_conv.cu     | 79 ++++++-------------
 python/paddle/v2/fluid/layers/nn.py         |  2 +-
 .../paddle/v2/fluid/tests/test_conv2d_op.py |  4 +-
 5 files changed, 34 insertions(+), 63 deletions(-)

diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc
index d25f3fd1a0f..cef7ddd5fe7 100644
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -320,20 +320,20 @@ REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
             ops::ConvOpGrad);
 
 // depthwise convolution op
-REGISTER_OP(depthwise_conv, ops::ConvOp, ops::Conv2DOpMaker,
-            depthwise_conv_grad, ops::ConvOpGrad);
+REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+            depthwise_conv2d_grad, ops::ConvOpGrad);
 REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
             ops::ConvOpGrad);
 
 // depthwise conv kernel
 // TODO(xingzhaolong): neon kernel for mobile
 REGISTER_OP_CPU_KERNEL(
-    depthwise_conv,
+    depthwise_conv2d,
     ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
 
 REGISTER_OP_CPU_KERNEL(
-    depthwise_conv_grad,
+    depthwise_conv2d_grad,
     ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc
index 02a4e52466f..d0bd40ee95d 100644
--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
@@ -17,12 +17,12 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 
 REGISTER_OP_CUDA_KERNEL(
-    depthwise_conv,
+    depthwise_conv2d,
     ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
     ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
 
 REGISTER_OP_CUDA_KERNEL(
-    depthwise_conv_grad,
+    depthwise_conv2d_grad,
     ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/math/depthwise_conv.cu b/paddle/operators/math/depthwise_conv.cu
index b9b958c92b0..b212e782083 100644
--- a/paddle/operators/math/depthwise_conv.cu
+++ b/paddle/operators/math/depthwise_conv.cu
@@ -42,38 +42,23 @@ __global__ void KernelDepthwiseConv(
     T value = 0;
     const int h_in_start = -padding_height + h_out * stride_height;
     const int w_in_start = -padding_width + w_out * stride_width;
-    const int h_in_end =
-        -padding_height + h_out * stride_height + filter_height - 1;
-    const int w_in_end =
-        -padding_width + w_out * stride_width + filter_width - 1;
+    const int h_in_end = h_in_start + filter_height;
+    const int w_in_end = w_in_start + filter_width;
     const int in_offset =
         ((batch * input_channels + c_in) * input_height) * input_width;
-    if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
-        (w_in_end < input_width)) {
-      for (int kh = 0; kh < filter_height; ++kh) {
-        for (int kw = 0; kw < filter_width; ++kw) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          const int offset = in_offset + h_in * input_width + w_in;
-
-          value += (*weight) * input_data[offset];
-          ++weight;
-        }
-      }
-    } else {
-      for (int kh = 0; kh < filter_height; ++kh) {
-        for (int kw = 0; kw < filter_width; ++kw) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
-              (w_in < input_width)) {
-            const int offset = in_offset + h_in * input_width + w_in;
-            value += (*weight) * input_data[offset];
-          }
-          ++weight;
-        }
+    const int h_end = h_in_end < input_height ? h_in_end : input_height;
+    const int w_end = w_in_end < input_width ? w_in_end : input_width;
+    const int h_start = h_in_start > 0 ? h_in_start : 0;
+    const int w_start = w_in_start > 0 ? w_in_start : 0;
+
+    for (int h_in = h_start; h_in < h_end; h_in++) {
+      for (int w_in = w_start; w_in < w_end; w_in++) {
+        const int offset = in_offset + h_in * input_width + w_in;
+        value +=
+            weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] *
+            input_data[offset];
       }
     }
     output_data[index] = value;
@@ -162,32 +147,18 @@ __global__ void KernelDepthwiseConvFilterGrad(
         (batch * input_channels + c_in) * input_height * input_width;
     T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
-
-    if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
-        (w_in_end < input_width)) {
-      for (int kw = 0; kw < filter_width; kw++) {
-        for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          const int offset = in_offset + h_in * input_width + w_in;
-          const T diff_temp = output_grad_data[index] * input_data[offset];
-          T* addr = addr_offset + kh * filter_width + kw;
-          paddle::platform::CudaAtomicAdd(addr, diff_temp);
-        }
-      }
-    } else {
-      for (int kw = 0; kw < filter_width; kw++) {
-        for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
-              (w_in < input_width)) {
-            const int offset = in_offset + h_in * input_width + w_in;
-            const T diff_temp = output_grad_data[index] * input_data[offset];
-            T* addr = addr_offset + kh * filter_width + kw;
-            paddle::platform::CudaAtomicAdd(addr, diff_temp);
-          }
-        }
+    const int h_end = h_in_end < input_height ? h_in_end : input_height;
+    const int w_end = w_in_end < input_width ? w_in_end : input_width;
+    const int h_start = h_in_start > 0 ? h_in_start : 0;
+    const int w_start = w_in_start > 0 ? w_in_start : 0;
+
+    for (int h_in = h_start; h_in < h_end; h_in++) {
+      for (int w_in = w_start; w_in < w_end; w_in++) {
+        const int offset = in_offset + h_in * input_width + w_in;
+        const T diff_temp = output_grad_data[index] * input_data[offset];
+        T* addr = addr_offset + (h_in - h_in_start) * filter_width +
+                  (w_in - w_in_start);
+        paddle::platform::CudaAtomicAdd(addr, diff_temp);
       }
     }
   }
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 4be6ae8ed69..aaf096f0dd5 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -1237,7 +1237,7 @@ def conv2d(input,
     l_type = 'conv2d'
     if (num_channels == groups and num_filters % num_channels == 0 and
             not use_cudnn):
-        l_type = 'depthwise_conv'
+        l_type = 'depthwise_conv2d'
 
     helper = LayerHelper(l_type, **locals())
     dtype = helper.input_dtype()
diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py
index a034d0ab918..7512ea333e3 100644
--- a/python/paddle/v2/fluid/tests/test_conv2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py
@@ -250,7 +250,7 @@ class TestDepthwiseConv(TestConv2dOp):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] / self.groups
         self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv"
+        self.op_type = "depthwise_conv2d"
 
 
 class TestDepthwiseConv2(TestConv2dOp):
@@ -262,7 +262,7 @@ class TestDepthwiseConv2(TestConv2dOp):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] / self.groups
         self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv"
+        self.op_type = "depthwise_conv2d"
 
 
 # cudnn v5 does not support dilation conv.
--
GitLab
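
The efficiency change is the same in both CUDA kernels: the old code kept two copies of the filter loop nest, an all-in-bounds fast path and a slow path that bounds-checked every tap, while the new code clamps the loop bounds to the image once per output element and then walks only the valid input pixels, recovering the filter index as (h_in - h_in_start) * filter_width + (w_in - w_in_start). Out-of-bounds taps contributed nothing in the old slow path, so skipping them is equivalent. Below is a minimal CPU-side sketch of the idea; the names mirror the kernel, but the standalone function and its signature are illustrative, not part of the patch:

    #include <algorithm>
    #include <vector>

    // One output element of a depthwise convolution: dot the filter against
    // the input window starting at (h_in_start, w_in_start), which may hang
    // off the image when padding is in effect.
    float DepthwiseWindowDot(const std::vector<float>& input,
                             const std::vector<float>& weight,
                             int input_height, int input_width,
                             int filter_height, int filter_width,
                             int h_in_start, int w_in_start) {
      // Clamp the window to the image once; the loops below never branch.
      const int h_start = std::max(h_in_start, 0);
      const int w_start = std::max(w_in_start, 0);
      const int h_end = std::min(h_in_start + filter_height, input_height);
      const int w_end = std::min(w_in_start + filter_width, input_width);
      float value = 0;
      for (int h_in = h_start; h_in < h_end; ++h_in) {
        for (int w_in = w_start; w_in < w_end; ++w_in) {
          // Subtracting the window origin recovers the filter tap index.
          value += weight[(h_in - h_in_start) * filter_width +
                          (w_in - w_in_start)] *
                   input[h_in * input_width + w_in];
        }
      }
      return value;
    }

The filter-gradient kernel uses the same clamped loops, with the recovered filter index addressing the CudaAtomicAdd target instead of the weight read. On the Python side nothing changes for callers: fluid.layers.conv2d still routes to the depthwise op when num_channels == groups, num_filters is a multiple of num_channels, and use_cudnn is False; only the op name it emits becomes depthwise_conv2d.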