提交 2ffa3a8b 编写于 作者: X xzl

rename op to depthwise_conv2d, more efficient

上级 fc9b2b9a
...@@ -320,20 +320,20 @@ REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad, ...@@ -320,20 +320,20 @@ REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
ops::ConvOpGrad); ops::ConvOpGrad);
// depthwise convolution op // depthwise convolution op
REGISTER_OP(depthwise_conv, ops::ConvOp, ops::Conv2DOpMaker, REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
depthwise_conv_grad, ops::ConvOpGrad); depthwise_conv2d_grad, ops::ConvOpGrad);
REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
ops::ConvOpGrad); ops::ConvOpGrad);
// depthwise conv kernel // depthwise conv kernel
// TODO(xingzhaolong): neon kernel for mobile // TODO(xingzhaolong): neon kernel for mobile
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
depthwise_conv, depthwise_conv2d,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>); ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
depthwise_conv_grad, depthwise_conv2d_grad,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>, ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>); ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
......
...@@ -17,12 +17,12 @@ limitations under the License. */ ...@@ -17,12 +17,12 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
depthwise_conv, depthwise_conv2d,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>, ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>); ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
depthwise_conv_grad, depthwise_conv2d_grad,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>, ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>); ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
......
...@@ -42,38 +42,23 @@ __global__ void KernelDepthwiseConv( ...@@ -42,38 +42,23 @@ __global__ void KernelDepthwiseConv(
T value = 0; T value = 0;
const int h_in_start = -padding_height + h_out * stride_height; const int h_in_start = -padding_height + h_out * stride_height;
const int w_in_start = -padding_width + w_out * stride_width; const int w_in_start = -padding_width + w_out * stride_width;
const int h_in_end = const int h_in_end = h_in_start + filter_height;
-padding_height + h_out * stride_height + filter_height - 1; const int w_in_end = w_in_start + filter_width;
const int w_in_end =
-padding_width + w_out * stride_width + filter_width - 1;
const int in_offset = const int in_offset =
((batch * input_channels + c_in) * input_height) * input_width; ((batch * input_channels + c_in) * input_height) * input_width;
if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) && const int h_end = h_in_end < input_height ? h_in_end : input_height;
(w_in_end < input_width)) { const int w_end = w_in_end < input_width ? w_in_end : input_width;
for (int kh = 0; kh < filter_height; ++kh) { const int h_start = h_in_start > 0 ? h_in_start : 0;
for (int kw = 0; kw < filter_width; ++kw) { const int w_start = w_in_start > 0 ? w_in_start : 0;
const int h_in = h_in_start + kh;
const int w_in = w_in_start + kw; for (int h_in = h_start; h_in < h_end; h_in++) {
const int offset = in_offset + h_in * input_width + w_in; for (int w_in = w_start; w_in < w_end; w_in++) {
const int offset = in_offset + h_in * input_width + w_in;
value += (*weight) * input_data[offset]; value +=
++weight; weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] *
} input_data[offset];
}
} else {
for (int kh = 0; kh < filter_height; ++kh) {
for (int kw = 0; kw < filter_width; ++kw) {
const int h_in = h_in_start + kh;
const int w_in = w_in_start + kw;
if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
(w_in < input_width)) {
const int offset = in_offset + h_in * input_width + w_in;
value += (*weight) * input_data[offset];
}
++weight;
}
} }
} }
output_data[index] = value; output_data[index] = value;
...@@ -162,32 +147,18 @@ __global__ void KernelDepthwiseConvFilterGrad( ...@@ -162,32 +147,18 @@ __global__ void KernelDepthwiseConvFilterGrad(
(batch * input_channels + c_in) * input_height * input_width; (batch * input_channels + c_in) * input_height * input_width;
T* addr_offset = filter_grad_data + c_out * filter_height * filter_width; T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
const int h_end = h_in_end < input_height ? h_in_end : input_height;
if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) && const int w_end = w_in_end < input_width ? w_in_end : input_width;
(w_in_end < input_width)) { const int h_start = h_in_start > 0 ? h_in_start : 0;
for (int kw = 0; kw < filter_width; kw++) { const int w_start = w_in_start > 0 ? w_in_start : 0;
for (int kh = 0; kh < filter_height; kh++) {
const int h_in = h_in_start + kh; for (int h_in = h_start; h_in < h_end; h_in++) {
const int w_in = w_in_start + kw; for (int w_in = w_start; w_in < w_end; w_in++) {
const int offset = in_offset + h_in * input_width + w_in; const int offset = in_offset + h_in * input_width + w_in;
const T diff_temp = output_grad_data[index] * input_data[offset]; const T diff_temp = output_grad_data[index] * input_data[offset];
T* addr = addr_offset + kh * filter_width + kw; T* addr = addr_offset + (h_in - h_in_start) * filter_width +
paddle::platform::CudaAtomicAdd(addr, diff_temp); (w_in - w_in_start);
} paddle::platform::CudaAtomicAdd(addr, diff_temp);
}
} else {
for (int kw = 0; kw < filter_width; kw++) {
for (int kh = 0; kh < filter_height; kh++) {
const int h_in = h_in_start + kh;
const int w_in = w_in_start + kw;
if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
(w_in < input_width)) {
const int offset = in_offset + h_in * input_width + w_in;
const T diff_temp = output_grad_data[index] * input_data[offset];
T* addr = addr_offset + kh * filter_width + kw;
paddle::platform::CudaAtomicAdd(addr, diff_temp);
}
}
} }
} }
} }
......
...@@ -1237,7 +1237,7 @@ def conv2d(input, ...@@ -1237,7 +1237,7 @@ def conv2d(input,
l_type = 'conv2d' l_type = 'conv2d'
if (num_channels == groups and num_filters % num_channels == 0 and if (num_channels == groups and num_filters % num_channels == 0 and
not use_cudnn): not use_cudnn):
l_type = 'depthwise_conv' l_type = 'depthwise_conv2d'
helper = LayerHelper(l_type, **locals()) helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
......
...@@ -250,7 +250,7 @@ class TestDepthwiseConv(TestConv2dOp): ...@@ -250,7 +250,7 @@ class TestDepthwiseConv(TestConv2dOp):
assert np.mod(self.input_size[1], self.groups) == 0 assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 3, 3] self.filter_size = [6, f_c, 3, 3]
self.op_type = "depthwise_conv" self.op_type = "depthwise_conv2d"
class TestDepthwiseConv2(TestConv2dOp): class TestDepthwiseConv2(TestConv2dOp):
...@@ -262,7 +262,7 @@ class TestDepthwiseConv2(TestConv2dOp): ...@@ -262,7 +262,7 @@ class TestDepthwiseConv2(TestConv2dOp):
assert np.mod(self.input_size[1], self.groups) == 0 assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 3, 3] self.filter_size = [6, f_c, 3, 3]
self.op_type = "depthwise_conv" self.op_type = "depthwise_conv2d"
# cudnn v5 does not support dilation conv. # cudnn v5 does not support dilation conv.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册