Commit 6e17babe, authored by xzl

More efficient, add check on python side

Parent: b5ea0483
...
@@ -159,7 +159,6 @@ if (WITH_GPU)
     op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS
         vol2col depthwise_conv)
-    # op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col)
     op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function)
     op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
     op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc
...
...
@@ -46,16 +46,18 @@ __global__ void KernelDepthwiseConv(
         -padding_height + h_out * stride_height + filter_height - 1;
     const int w_in_end =
         -padding_width + w_out * stride_width + filter_width - 1;
+    const int in_offset =
+        ((batch * input_channels + c_in) * input_height) * input_width;
     if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
         (w_in_end < input_width)) {
       for (int kh = 0; kh < filter_height; ++kh) {
         for (int kw = 0; kw < filter_width; ++kw) {
-          const int h_in = -padding_height + h_out * stride_height + kh;
-          const int w_in = -padding_width + w_out * stride_width + kw;
-          const int offset =
-              ((batch * input_channels + c_in) * input_height + h_in) *
-                  input_width +
-              w_in;
+          const int h_in = h_in_start + kh;
+          const int w_in = w_in_start + kw;
+          const int offset = in_offset + h_in * input_width + w_in;
           value += (*weight) * input_data[offset];
           ++weight;
         }
@@ -63,14 +65,11 @@ __global__ void KernelDepthwiseConv(
     } else {
       for (int kh = 0; kh < filter_height; ++kh) {
         for (int kw = 0; kw < filter_width; ++kw) {
-          const int h_in = -padding_height + h_out * stride_height + kh;
-          const int w_in = -padding_width + w_out * stride_width + kw;
+          const int h_in = h_in_start + kh;
+          const int w_in = w_in_start + kw;
           if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
               (w_in < input_width)) {
-            const int offset =
-                ((batch * input_channels + c_in) * input_height + h_in) *
-                    input_width +
-                w_in;
+            const int offset = in_offset + h_in * input_width + w_in;
             value += (*weight) * input_data[offset];
           }
           ++weight;
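Both forward-kernel hunks depend on one index identity: the flattened NCHW offset ((batch * input_channels + c_in) * input_height + h_in) * input_width + w_in splits into a base in_offset that is constant across the filter window plus h_in * input_width + w_in, so the base is now computed once per output element instead of once per filter tap, and h_in/w_in reuse the precomputed h_in_start/w_in_start. A minimal standalone check of that identity, with assumed sizes that are not from the commit (plain host code; builds with nvcc or any C++ compiler):

// Hypothetical sanity check, not part of the commit: the hoisted base
// offset plus the per-tap remainder equals the fully expanded NCHW index.
#include <cassert>
#include <cstdio>

int main() {
  const int batch = 3, input_channels = 8, c_in = 5;  // assumed sizes
  const int input_height = 32, input_width = 32;
  // Loop-invariant part, computed once per output element (as in the patch).
  const int in_offset =
      (batch * input_channels + c_in) * input_height * input_width;
  for (int h_in = 0; h_in < input_height; ++h_in) {
    for (int w_in = 0; w_in < input_width; ++w_in) {
      const int expanded =
          ((batch * input_channels + c_in) * input_height + h_in) *
              input_width +
          w_in;
      assert(expanded == in_offset + h_in * input_width + w_in);
    }
  }
  printf("hoisted offset matches the expanded index\n");
  return 0;
}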
...
@@ -159,36 +158,33 @@ __global__ void KernelDepthwiseConvFilterGrad(
     const int h_in_end =
         -padding_height + h_out * stride_height + filter_height;
     const int w_in_end = -padding_width + w_out * stride_width + filter_width;
+    const int in_offset =
+        (batch * input_channels + c_in) * input_height * input_width;
+
+    T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
+
     if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
         (w_in_end < input_width)) {
       for (int kw = 0; kw < filter_width; kw++) {
         for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = -padding_height + h_out * stride_height + kh;
-          const int w_in = -padding_width + w_out * stride_width + kw;
-          const int offset =
-              ((batch * input_channels + c_in) * input_height + h_in) *
-                  input_width +
-              w_in;
+          const int h_in = h_in_start + kh;
+          const int w_in = w_in_start + kw;
+          const int offset = in_offset + h_in * input_width + w_in;
           const T diff_temp = output_grad_data[index] * input_data[offset];
-          T* addr = filter_grad_data + c_out * filter_height * filter_width +
-                    kh * filter_width + kw;
+          T* addr = addr_offset + kh * filter_width + kw;
           paddle::platform::CudaAtomicAdd(addr, diff_temp);
         }
       }
     } else {
       for (int kw = 0; kw < filter_width; kw++) {
         for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = -padding_height + h_out * stride_height + kh;
-          const int w_in = -padding_width + w_out * stride_width + kw;
+          const int h_in = h_in_start + kh;
+          const int w_in = w_in_start + kw;
           if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
               (w_in < input_width)) {
-            const int offset =
-                ((batch * input_channels + c_in) * input_height + h_in) *
-                    input_width +
-                w_in;
+            const int offset = in_offset + h_in * input_width + w_in;
             const T diff_temp = output_grad_data[index] * input_data[offset];
-            T* addr = filter_grad_data + c_out * filter_height * filter_width +
-                      kh * filter_width + kw;
+            T* addr = addr_offset + kh * filter_width + kw;
             paddle::platform::CudaAtomicAdd(addr, diff_temp);
           }
         }
...
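KernelDepthwiseConvFilterGrad gets the same hoisting twice: in_offset for the input index and addr_offset for the per-output-channel slice of filter_grad_data. The paddle::platform::CudaAtomicAdd stays, because threads handling different output positions all accumulate into the same filter-weight slot, and a plain += would lose updates. A minimal sketch of that accumulation pattern, with a hypothetical kernel and sizes (not PaddlePaddle code; raw atomicAdd stands in for CudaAtomicAdd):

// Hypothetical sketch: many threads accumulate into one gradient slot,
// mirroring the contention in KernelDepthwiseConvFilterGrad.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void AccumulateFilterGrad(const float* contrib, int n,
                                     float* grad_slot) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    atomicAdd(grad_slot, contrib[i]);  // a plain += here would race
  }
}

int main() {
  const int n = 1 << 20;
  float *contrib = nullptr, *grad_slot = nullptr;
  cudaMallocManaged(&contrib, n * sizeof(float));
  cudaMallocManaged(&grad_slot, sizeof(float));
  for (int i = 0; i < n; ++i) contrib[i] = 1.0f;
  *grad_slot = 0.0f;
  AccumulateFilterGrad<<<(n + 255) / 256, 256>>>(contrib, n, grad_slot);
  cudaDeviceSynchronize();
  printf("sum = %.0f, expected %d\n", *grad_slot, n);
  cudaFree(contrib);
  cudaFree(grad_slot);
  return 0;
}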
...
@@ -1013,7 +1013,8 @@ def conv2d(input,
     num_channels = input.shape[1]
 
     l_type = 'conv2d'
-    if num_channels == groups and not use_cudnn:
+    if (num_channels == groups and num_filters % num_channels == 0 and
+            not use_cudnn):
         l_type = 'depthwise_conv'
 
     helper = LayerHelper(l_type, **locals())
...
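This hunk is the "check on python side" from the commit message: conv2d now dispatches to depthwise_conv only when the convolution is truly depthwise, i.e. one group per input channel and an output-channel count that is a whole multiple of the input channels (the shape the CUDA kernel assumes). A host-side sketch of the same predicate, using a hypothetical function name that is not the fluid API:

// Hypothetical mirror of the new dispatch condition in conv2d.
#include <cassert>
#include <string>

std::string SelectConvType(int num_channels, int num_filters, int groups,
                           bool use_cudnn) {
  if (num_channels == groups && num_filters % num_channels == 0 &&
      !use_cudnn) {
    return "depthwise_conv";  // true depthwise convolution
  }
  return "conv2d";  // fall back to the generic op
}

int main() {
  assert(SelectConvType(8, 16, 8, false) == "depthwise_conv");
  assert(SelectConvType(8, 15, 8, false) == "conv2d");  // 15 % 8 != 0
  assert(SelectConvType(8, 16, 8, true) == "conv2d");   // cuDNN path wins
  return 0;
}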