From ed857585b82823838e7557cded50e6033b55d31b Mon Sep 17 00:00:00 2001
From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com>
Date: Thu, 28 Jul 2022 17:10:53 +0800
Subject: [PATCH] [Paddle Inference] Support depthwise_conv2d fp16. (#44642)

* depthwise_fp16

* depthwise_fp16

* depthwise_fp16

* depthwise_fp16
---
 paddle/phi/kernels/gpu/depthwise_conv.h       | 59 ++++++++++++-------
 .../kernels/gpu/depthwise_conv_grad_kernel.cu |  3 +-
 .../phi/kernels/gpu/depthwise_conv_kernel.cu  |  3 +-
 3 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h
index 8586c56c560..eae7b775199 100644
--- a/paddle/phi/kernels/gpu/depthwise_conv.h
+++ b/paddle/phi/kernels/gpu/depthwise_conv.h
@@ -153,7 +153,7 @@ __device__ __inline__ void KernelDepthwiseConvNCHW(
 
   const int c_in = c_out / filter_multiplier;
   const T* weight = filter_data + c_out * filter_height * filter_width;
-  T value = 0;
+  T value(0);
   const int h_in_start = -padding_height + h_out * stride_height;
   const int w_in_start = -padding_width + w_out * stride_width;
   const int h_in_end = h_in_start + filter_height * dilate_height;
@@ -176,7 +176,7 @@ __device__ __inline__ void KernelDepthwiseConvNCHW(
         int offset = in_offset + h_in * input_width + w_in;
         T in_data = input_data[offset];
         if (fuse_relu_before_conv) {
-          value += weight[weight_offset] * max(0.0f, in_data);
+          value += weight[weight_offset] * T(max(0.0f, double(in_data)));
         } else {
           value += weight[weight_offset] * in_data;
         }
@@ -205,7 +205,7 @@ __device__ __inline__ void KernelDepthwiseConvNHWC(
   const int batch = idx / output_width / output_height / output_channels;
 
   const int c_in = c_out / filter_multiplier;
-  T value = 0;
+  T value(0);
   const int h_in_start = -padding_height + h_out * stride_height;
   const int w_in_start = -padding_width + w_out * stride_width;
   const int h_in_end = h_in_start + filter_height * dilate_height;
@@ -228,7 +228,7 @@ __device__ __inline__ void KernelDepthwiseConvNHWC(
       T in_data = input_data[offset];
       const T* weight = filter_data + weight_offset * output_channels + c_out;
       if (fuse_relu_before_conv) {
-        value += weight[0] * max(0.0f, in_data);
+        value += weight[0] * T(max(0.0f, double(in_data)));
       } else {
         value += weight[0] * in_data;
       }
@@ -258,7 +258,7 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNCHW(
   const int c_out = blockIdx.x;
   const int c_in = c_out / filter_multiplier;
 
-  T value = 0;
+  T value(0);
   const int h_in_start = -padding_height + h_out * stride_height;
   const int w_in_start = -padding_width + w_out * stride_width;
   const int h_in_end = h_in_start + c_filter * dilate_height;
@@ -281,7 +281,7 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNCHW(
         int offset = in_offset + h_in * input_width + w_in;
         if (fuse_relu_before_conv) {
           value += r_weight[h_f * c_filter + w_f] *
-                   max(0.0f, input_data[offset]);
+                   T(max(0.0f, double(input_data[offset])));
         } else {
           value += r_weight[h_f * c_filter + w_f] * input_data[offset];
         }
@@ -325,7 +325,7 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNHWC(
     if (w_out >= output_width) {
       continue;
     }
-    T value = 0;
+    T value(0);
     const int w_in_start = -padding_width + w_out * stride_width;
     for (int h_in = h_in_start, h_f = 0; h_f < c_filter;
          h_in += dilate_height, h_f++) {
@@ -337,7 +337,7 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNHWC(
             in_offset + (h_in * input_width + w_in) * input_channels + c_in;
         if (fuse_relu_before_conv) {
           value += r_weight[h_f * c_filter + w_f] *
-                   max(0.0f, input_data[offset]);
+                   T(max(0.0, double(input_data[offset])));
         } else {
           value += r_weight[h_f * c_filter + w_f] * input_data[offset];
         }
@@ -482,13 +482,13 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNCHW(
           w_in - (filter_width - 1) * dilate_width + padding_width;
       int w_out_end = w_in + padding_width;
 
-      T value = 0;
+      T value(0);
       int index =
           ((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
           w_in;
 
       if (fuse_relu_before_conv) {
-        if (input_data[index] <= 0) {
+        if (input_data[index] <= T(0)) {
           input_grad_data[index] = 0;
           continue;
         }
@@ -539,12 +539,12 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNHWC(
       int w_out_start =
          w_in - (filter_width - 1) * dilate_width + padding_width;
 
-      T value = 0;
+      T value(0);
       int index = ((batch * input_height + h_in) * input_width + w_in) *
                       input_channels +
                   c_in;
       if (fuse_relu_before_conv) {
-        if (input_data[index] <= 0) {
+        if (input_data[index] <= T(0)) {
           input_grad_data[index] = 0;
           continue;
         }
@@ -603,12 +603,12 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW(
       int h_out_start = h_in - (c_filter - 1) * dilate_height + padding_height;
       int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width;
-      T value = 0;
+      T value(0);
       int index =
           ((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
          w_in;
 
       if (fuse_relu_before_conv) {
-        if (input_data[index] <= 0) {
+        if (input_data[index] <= T(0)) {
           input_grad_data[index] = 0;
           continue;
         }
@@ -676,12 +676,12 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC(
       }
      int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width;
 
-      T value = 0;
+      T value(0);
       int index = ((batch * input_height + h_in) * input_width + w_in) *
                       input_channels +
                   c_in;
       if (fuse_relu_before_conv) {
-        if (input_data[index] <= 0) {
+        if (input_data[index] <= T(0)) {
           input_grad_data[index] = 0;
           continue;
         }
@@ -854,7 +854,7 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW(
     const int dilate_height,
     const int dilate_width,
     T* filter_grad_data) {
-  T s = 0;
+  T s(0);
   int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x;
 
   for (int image_w = threadIdx.x; image_w < output_width;
@@ -880,7 +880,7 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW(
                            image_wk;
         if (fuse_relu_before_conv) {
           s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
-               max(0.0f, input_data[input_id]);
+               T(max(0.0f, double(input_data[input_id])));
         } else {
           s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
                input_data[input_id];
@@ -921,7 +921,7 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC(
   int kernel_ih = blockIdx.x / filter_width;
   for (int kernel_id = threadIdx.x; kernel_id < output_channels;
        kernel_id += blockDim.x) {
-    T s = 0;
+    T s(0);
     int gbid =
         ((kernel_id * filter_height) + kernel_ih) * filter_width + kernel_iw;
     for (int image_w = threadIdx.y; image_w < output_width;
@@ -941,7 +941,7 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC(
                              kernel_id / filter_multiplier;
         if (fuse_relu_before_conv) {
           s += output_grad_data[gaid(bid, image_h, image_w, kernel_id)] *
-               max(0.0f, input_data[input_id]);
+               T(max(0.0f, double(input_data[input_id])));
         } else {
           s += output_grad_data[gaid(bid, image_h, image_w, kernel_id)] *
                input_data[input_id];
@@ -1010,9 +1010,10 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC(
           ((bid * output_height + image_h) * output_width + image_w) *
               output_channels +
           kernel_id;
-      T s = 0;
+      T s(0);
       if (fuse_relu_before_conv) {
-        s = output_grad_data[output_id] * max(0.0f, input_data[input_id]);
+        s = output_grad_data[output_id] *
+            T(max(0.0f, double(input_data[input_id])));
       } else {
         s = output_grad_data[output_id] * input_data[input_id];
       }
@@ -1672,21 +1673,35 @@ class DepthwiseConvFilterGradFunctor;
 template class DepthwiseConvFunctor<phi::GPUContext, double, false>;
+template class DepthwiseConvFunctor<phi::GPUContext, platform::float16, false>;
 
 template class DepthwiseConvInputGradFunctor<phi::GPUContext, float, false>;
 template class DepthwiseConvInputGradFunctor<phi::GPUContext, double, false>;
+template class DepthwiseConvInputGradFunctor<phi::GPUContext,
+                                             platform::float16,
+                                             false>;
 
 template class DepthwiseConvFilterGradFunctor<phi::GPUContext, float, false>;
 template class DepthwiseConvFilterGradFunctor<phi::GPUContext, double, false>;
+template class DepthwiseConvFilterGradFunctor<phi::GPUContext,
+                                              platform::float16,
+                                              false>;
 
 template class DepthwiseConvFunctor<phi::GPUContext, float, true>;
 template class DepthwiseConvFunctor<phi::GPUContext, double, true>;
+template class DepthwiseConvFunctor<phi::GPUContext, platform::float16, true>;
 
 template class DepthwiseConvInputGradFunctor<phi::GPUContext, float, true>;
 template class DepthwiseConvInputGradFunctor<phi::GPUContext, double, true>;
+template class DepthwiseConvInputGradFunctor<phi::GPUContext,
+                                             platform::float16,
+                                             true>;
 
 template class DepthwiseConvFilterGradFunctor<phi::GPUContext, float, true>;
 template class DepthwiseConvFilterGradFunctor<phi::GPUContext, double, true>;
+template class DepthwiseConvFilterGradFunctor<phi::GPUContext,
+                                              platform::float16,
+                                              true>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu
index 5fc5482a080..9ab59bf4c68 100644
--- a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu
@@ -139,4 +139,5 @@ PD_REGISTER_KERNEL(depthwise_conv2d_grad,
                    ALL_LAYOUT,
                    phi::DepthwiseConvGradKernel,
                    float,
-                   double) {}
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu
index 7310883e595..7fd482e6591 100644
--- a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu
+++ b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu
@@ -124,4 +124,5 @@ PD_REGISTER_KERNEL(depthwise_conv2d,
                    ALL_LAYOUT,
                    phi::DepthwiseConvKernel,
                    float,
-                   double) {}
+                   double,
+                   phi::dtype::float16) {}
-- 
GitLab
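The recurring change in the kernels above is what makes fp16 possible here: `T value = 0;` becomes `T value(0);`, and the fused-ReLU clamp goes through `double` before being cast back to `T`, so the templates no longer rely on an implicit integer-to-T conversion or on a `max(float, T)` overload that a half-precision `T` may not provide. The following is a minimal host-side sketch of that pattern, not Paddle code: `F16` is a hypothetical stand-in for a half type with only explicit conversions, and `accumulate` is an invented helper that mirrors the patched accumulation loop.

#include <algorithm>
#include <cstdio>

// Hypothetical stand-in for a half-precision type. It only converts
// explicitly, which is the situation the patch guards against.
struct F16 {
  float v;  // a real fp16 type stores 16 bits; float keeps the sketch simple
  F16() : v(0.0f) {}
  explicit F16(double d) : v(static_cast<float>(d)) {}
  explicit operator double() const { return static_cast<double>(v); }
  F16 operator*(const F16& o) const { return F16(double(v) * double(o.v)); }
  F16& operator+=(const F16& o) {
    v += o.v;
    return *this;
  }
};

template <typename T, bool fuse_relu>
T accumulate(const T* w, const T* x, int n) {
  // Same pattern as the patched kernels:
  //   T value(0)             -- direct-initialization works even when T's
  //                             constructor is explicit;
  //   T(max(0.0, double(x))) -- clamp in double, then cast back, so no
  //                             max(float, T) overload is required.
  T value(0);
  for (int i = 0; i < n; ++i) {
    if (fuse_relu) {
      value += w[i] * T(std::max(0.0, double(x[i])));
    } else {
      value += w[i] * x[i];
    }
  }
  return value;
}

int main() {
  F16 w[2] = {F16(0.5), F16(2.0)};
  F16 x[2] = {F16(-1.0), F16(3.0)};
  // 0.5 * max(0, -1) + 2.0 * max(0, 3) = 6
  printf("%f\n", double(accumulate<F16, true>(w, x, 2)));

  // The same template still works for plain float.
  float wf[2] = {0.5f, 2.0f};
  float xf[2] = {-1.0f, 3.0f};
  printf("%f\n", double(accumulate<float, true>(wf, xf, 2)));
  return 0;
}

Keeping the arithmetic generic this way is what lets the new float16 functor instantiations and the `phi::dtype::float16` kernel registrations above share the same CUDA kernel templates as float and double.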