提交 2ffa3a8b 编写于 作者: X xzl

rename op to depthwise_conv2d, more efficient

上级 fc9b2b9a
......@@ -320,20 +320,20 @@ REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
ops::ConvOpGrad);
// depthwise convolution op
REGISTER_OP(depthwise_conv, ops::ConvOp, ops::Conv2DOpMaker,
depthwise_conv_grad, ops::ConvOpGrad);
REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
depthwise_conv2d_grad, ops::ConvOpGrad);
REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
ops::ConvOpGrad);
// depthwise conv kernel
// TODO(xingzhaolong): neon kernel for mobile
REGISTER_OP_CPU_KERNEL(
depthwise_conv,
depthwise_conv2d,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
depthwise_conv_grad,
depthwise_conv2d_grad,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -17,12 +17,12 @@ limitations under the License. */
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
depthwise_conv,
depthwise_conv2d,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
depthwise_conv_grad,
depthwise_conv2d_grad,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
......
......@@ -42,38 +42,23 @@ __global__ void KernelDepthwiseConv(
T value = 0;
const int h_in_start = -padding_height + h_out * stride_height;
const int w_in_start = -padding_width + w_out * stride_width;
const int h_in_end =
-padding_height + h_out * stride_height + filter_height - 1;
const int w_in_end =
-padding_width + w_out * stride_width + filter_width - 1;
const int h_in_end = h_in_start + filter_height;
const int w_in_end = w_in_start + filter_width;
const int in_offset =
((batch * input_channels + c_in) * input_height) * input_width;
if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
(w_in_end < input_width)) {
for (int kh = 0; kh < filter_height; ++kh) {
for (int kw = 0; kw < filter_width; ++kw) {
const int h_in = h_in_start + kh;
const int w_in = w_in_start + kw;
const int offset = in_offset + h_in * input_width + w_in;
value += (*weight) * input_data[offset];
++weight;
}
}
} else {
for (int kh = 0; kh < filter_height; ++kh) {
for (int kw = 0; kw < filter_width; ++kw) {
const int h_in = h_in_start + kh;
const int w_in = w_in_start + kw;
if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
(w_in < input_width)) {
const int offset = in_offset + h_in * input_width + w_in;
value += (*weight) * input_data[offset];
}
++weight;
}
const int h_end = h_in_end < input_height ? h_in_end : input_height;
const int w_end = w_in_end < input_width ? w_in_end : input_width;
const int h_start = h_in_start > 0 ? h_in_start : 0;
const int w_start = w_in_start > 0 ? w_in_start : 0;
for (int h_in = h_start; h_in < h_end; h_in++) {
for (int w_in = w_start; w_in < w_end; w_in++) {
const int offset = in_offset + h_in * input_width + w_in;
value +=
weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] *
input_data[offset];
}
}
output_data[index] = value;
......@@ -162,32 +147,18 @@ __global__ void KernelDepthwiseConvFilterGrad(
(batch * input_channels + c_in) * input_height * input_width;
T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
(w_in_end < input_width)) {
for (int kw = 0; kw < filter_width; kw++) {
for (int kh = 0; kh < filter_height; kh++) {
const int h_in = h_in_start + kh;
const int w_in = w_in_start + kw;
const int offset = in_offset + h_in * input_width + w_in;
const T diff_temp = output_grad_data[index] * input_data[offset];
T* addr = addr_offset + kh * filter_width + kw;
paddle::platform::CudaAtomicAdd(addr, diff_temp);
}
}
} else {
for (int kw = 0; kw < filter_width; kw++) {
for (int kh = 0; kh < filter_height; kh++) {
const int h_in = h_in_start + kh;
const int w_in = w_in_start + kw;
if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
(w_in < input_width)) {
const int offset = in_offset + h_in * input_width + w_in;
const T diff_temp = output_grad_data[index] * input_data[offset];
T* addr = addr_offset + kh * filter_width + kw;
paddle::platform::CudaAtomicAdd(addr, diff_temp);
}
}
const int h_end = h_in_end < input_height ? h_in_end : input_height;
const int w_end = w_in_end < input_width ? w_in_end : input_width;
const int h_start = h_in_start > 0 ? h_in_start : 0;
const int w_start = w_in_start > 0 ? w_in_start : 0;
for (int h_in = h_start; h_in < h_end; h_in++) {
for (int w_in = w_start; w_in < w_end; w_in++) {
const int offset = in_offset + h_in * input_width + w_in;
const T diff_temp = output_grad_data[index] * input_data[offset];
T* addr = addr_offset + (h_in - h_in_start) * filter_width +
(w_in - w_in_start);
paddle::platform::CudaAtomicAdd(addr, diff_temp);
}
}
}
......
......@@ -1237,7 +1237,7 @@ def conv2d(input,
l_type = 'conv2d'
if (num_channels == groups and num_filters % num_channels == 0 and
not use_cudnn):
l_type = 'depthwise_conv'
l_type = 'depthwise_conv2d'
helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype()
......
......@@ -250,7 +250,7 @@ class TestDepthwiseConv(TestConv2dOp):
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 3, 3]
self.op_type = "depthwise_conv"
self.op_type = "depthwise_conv2d"
class TestDepthwiseConv2(TestConv2dOp):
......@@ -262,7 +262,7 @@ class TestDepthwiseConv2(TestConv2dOp):
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 3, 3]
self.op_type = "depthwise_conv"
self.op_type = "depthwise_conv2d"
# cudnn v5 does not support dilation conv.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册