diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index bcc7959fddee93e916c1ea39ef4ee5e24b978bd1..3630634dd3b8923c6eec7ee91ec9e359338edcde 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -139,56 +139,53 @@ __forceinline__ __device__ T BlockReduceSum(T val) { // A Cuda kernel to compute the depthwise convolution forward pass // in NCHW format. -template +template __device__ __inline__ void KernelDepthwiseConvNCHW( ARG_DEFINE_KernelDepthwiseConv) { + const int fw_size = c_filter != -1 ? c_filter : filter_width; + const int fh_size = c_filter != -1 ? c_filter : filter_height; int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx >= (output_channels * batch_size * output_height * output_width)) return; - const int w_out = idx % output_width; - const int h_out = (idx / output_width) % output_height; - const int c_out = (idx / output_width / output_height) % output_channels; - const int batch = idx / output_width / output_height / output_channels; + int tmp_1 = idx / output_width; + const int w_out = idx - tmp_1 * output_width; + int tmp_2 = tmp_1 / output_height; + const int h_out = tmp_1 - tmp_2 * output_height; + tmp_1 = tmp_2; + tmp_2 = tmp_1 / output_channels; + const int c_out = tmp_1 - tmp_2 * output_channels; + const int batch = tmp_2; const int c_in = c_out / filter_multiplier; - const T* weight = filter_data + c_out * filter_height * filter_width; T value(0); - const int h_in_start = -padding_height + h_out * stride_height; - const int w_in_start = -padding_width + w_out * stride_width; - const int h_in_end = h_in_start + filter_height * dilate_height; - const int w_in_end = w_in_start + filter_width * dilate_width; int in_offset = ((batch * input_channels + c_in) * input_height) * input_width; - - const int h_end = h_in_end < input_height ? h_in_end : input_height; - const int w_end = w_in_end < input_width ? w_in_end : input_width; - const int h_start = h_in_start > 0 ? h_in_start : 0; - const int w_start = w_in_start > 0 ? w_in_start : 0; - int weight_offset = 0; + int weight_offset = c_out * filter_height * filter_width; + int h_in_start = -padding_height + h_out * stride_height; + int w_in_start = -padding_width + w_out * stride_width; #pragma unroll - for (int h_in = h_in_start; h_in < h_in_end; h_in += dilate_height) { + for (int fh = 0, h_in = h_in_start; fh < fh_size; + fh++, h_in += dilate_height) { #pragma unroll - for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) { - if (h_in >= h_start && h_in < h_end && w_in >= w_start && w_in < w_end) { + for (int fw = 0, w_in = w_in_start; fw < fw_size; + fw++, w_in += dilate_width) { + if (h_in >= 0 && h_in < input_height && w_in >= 0 && w_in < input_width) { int offset = in_offset + h_in * input_width + w_in; T in_data = input_data[offset]; if (fuse_relu_before_conv) { - value += weight[weight_offset] * - T(max(0.0f, static_cast(in_data))); + value += filter_data[weight_offset] * + static_cast(max(0.0f, static_cast(in_data))); } else { - value += weight[weight_offset] * in_data; + value += filter_data[weight_offset] * in_data; } } weight_offset++; } } - int index = batch * output_channels * output_height * output_width + - c_out * output_height * output_width + h_out * output_width + - w_out; - output_data[index] = value; + output_data[idx] = value; } // A Cuda kernel to compute the depthwise convolution forward pass @@ -229,7 +226,8 @@ __device__ __inline__ void KernelDepthwiseConvNHWC( T in_data = input_data[offset]; const T* weight = filter_data + weight_offset * output_channels + c_out; if (fuse_relu_before_conv) { - value += weight[0] * T(max(0.0f, static_cast(in_data))); + value += weight[0] * + static_cast(max(0.0f, static_cast(in_data))); } else { value += weight[0] * in_data; } @@ -282,7 +280,8 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNCHW( int offset = in_offset + h_in * input_width + w_in; if (fuse_relu_before_conv) { value += r_weight[h_f * c_filter + w_f] * - T(max(0.0f, static_cast(input_data[offset]))); + static_cast( + max(0.0f, static_cast(input_data[offset]))); } else { value += r_weight[h_f * c_filter + w_f] * input_data[offset]; } @@ -338,7 +337,8 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNHWC( in_offset + (h_in * input_width + w_in) * input_channels + c_in; if (fuse_relu_before_conv) { value += r_weight[h_f * c_filter + w_f] * - T(max(0.0, static_cast(input_data[offset]))); + static_cast( + max(0.0, static_cast(input_data[offset]))); } else { value += r_weight[h_f * c_filter + w_f] * input_data[offset]; } @@ -368,25 +368,26 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { } if (c_filter == -1) { if (data_layout != DataLayout::kNHWC) { - KernelDepthwiseConvNCHW(input_data, - filter_data, - batch_size, - output_channels, - output_height, - output_width, - input_channels, - input_height, - input_width, - final_filter_multiplier, - filter_height, - filter_width, - h_stride, - w_stride, - padding_height, - padding_width, - dilate_height, - dilate_width, - output_data); + KernelDepthwiseConvNCHW( + input_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + output_data); } else { KernelDepthwiseConvNHWC(input_data, filter_data, @@ -881,7 +882,8 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW( image_wk; if (fuse_relu_before_conv) { s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * - T(max(0.0f, static_cast(input_data[input_id]))); + static_cast( + max(0.0f, static_cast(input_data[input_id]))); } else { s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * input_data[input_id]; @@ -942,7 +944,8 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( kernel_id / filter_multiplier; if (fuse_relu_before_conv) { s += output_grad_data[gaid(bid, image_h, image_w, kernel_id)] * - T(max(0.0f, static_cast(input_data[input_id]))); + static_cast( + max(0.0f, static_cast(input_data[input_id]))); } else { s += output_grad_data[gaid(bid, image_h, image_w, kernel_id)] * input_data[input_id]; @@ -1014,7 +1017,8 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC( T s(0); if (fuse_relu_before_conv) { s = output_grad_data[output_id] * - T(max(0.0f, static_cast(input_data[input_id]))); + static_cast( + max(0.0f, static_cast(input_data[input_id]))); } else { s = output_grad_data[output_id] * input_data[input_id]; }